18 files changed, 1276 insertions, 261 deletions
diff --git a/Changelog b/Changelog
index 37d0cd0063..20110cec73 100644
--- a/Changelog
+++ b/Changelog
@@ -18,6 +18,7 @@ version <next>:
 - ffplay dynamic volume control
 - displace filter
 - selectivecolor filter
+- extensive native AAC encoder improvements
 
 
 version 2.8:
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index 17af49c766..37f98adb31 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -252,6 +252,7 @@ typedef struct SingleChannelElement {
     INTFLOAT sf[120];                               ///< scalefactors
     int sf_idx[128];                                ///< scalefactor indices (used by encoder)
     uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
+    uint8_t can_pns[128];                           ///< band is allowed to PNS (informative)
     float  is_ener[128];                            ///< Intensity stereo pos (used by encoder)
     float pns_ener[128];                            ///< Noise energy values (used by encoder)
     DECLARE_ALIGNED(32, INTFLOAT, pcoeffs)[1024];   ///< coefficients for IMDCT, pristine
diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index 10ea14b141..dafdc9fab8 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -33,7 +33,9 @@
 #include "libavutil/libm.h" // brought forward to work around cygwin header breakage
 
 #include <float.h>
+
 #include "libavutil/mathematics.h"
+#include "mathops.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
@@ -50,9 +52,6 @@
 
 #include "libavcodec/aaccoder_twoloop.h"
 
-/** Frequency in Hz for lower limit of noise substitution **/
-#define NOISE_LOW_LIMIT 4000
-
 /* Parameter of f(x) = a*(lambda/100), defines the maximum fourier spread
  * beyond which no PNS is used (since the SFBs contain tone rather than noise) */
 #define NOISE_SPREAD_THRESHOLD 0.5073f
@@ -124,7 +123,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
                     rd += quantize_band_cost(s, &sce->coeffs[start + w*128],
                                              &s->scoefs[start + w*128], size,
                                              sce->sf_idx[(win+w)*16+swb], aac_cb_out_map[cb],
-                                             lambda / band->threshold, INFINITY, NULL, 0);
+                                             lambda / band->threshold, INFINITY, NULL, NULL, 0);
                 }
                 cost_stay_here = path[swb][cb].cost + rd;
                 cost_get_here  = minrd              + rd + run_bits + 4;
@@ -335,7 +334,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                     for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                         FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                         dist += quantize_band_cost(s, coefs + w2*128, s->scoefs + start + w2*128, sce->ics.swb_sizes[g],
-                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, 0);
+                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, NULL, 0);
                     }
                     minrd = FFMIN(minrd, dist);
 
@@ -499,7 +498,7 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
                                                ESC_BT,
                                                lambda,
                                                INFINITY,
-                                               &b,
+                                               &b, NULL,
                                                0);
                     dist -= b;
                 }
@@ -588,12 +587,36 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
 {
     FFPsyBand *band;
     int w, g, w2, i;
+    int wlen = 1024 / sce->ics.num_windows;
+    int bandwidth, cutoff;
     float *PNS = &s->scoefs[0*128], *PNS34 = &s->scoefs[1*128];
     float *NOR34 = &s->scoefs[3*128];
     const float lambda = s->lambda;
-    const float freq_mult = avctx->sample_rate/(1024.0f/sce->ics.num_windows)/2.0f;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
     const float thr_mult = NOISE_LAMBDA_REPLACE*(100.0f/lambda);
-    const float spread_threshold = NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f);
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float dist_bias = av_clipf(4.f * 120 / lambda, 0.25f, 4.0f);
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
+    } else {
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+    }
+
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
 
     memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
@@ -602,32 +625,44 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
             int noise_sfi;
             float dist1 = 0.0f, dist2 = 0.0f, noise_amp;
             float pns_energy = 0.0f, pns_tgt_energy, energy_ratio, dist_thresh;
-            float sfb_energy = 0.0f, threshold = 0.0f, spread = 0.0f;
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
             const int start = wstart+sce->ics.swb_offset[g];
             const float freq = (start-wstart)*freq_mult;
             const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
-            if (freq < NOISE_LOW_LIMIT || avctx->cutoff && freq >= avctx->cutoff)
+            if (freq < NOISE_LOW_LIMIT || (start-wstart) >= cutoff)
                 continue;
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                 band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                 sfb_energy += band->energy;
-                spread     += band->spread;
+                spread     = FFMIN(spread, band->spread);
                 threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
+                } else {
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
+                }
             }
 
             /* Ramps down at ~8000Hz and loosens the dist threshold */
-            dist_thresh = FFMIN(2.5f*NOISE_LOW_LIMIT/freq, 2.5f);
-
-            /* zero and energy close to threshold usually means hole avoidance,
-             * we do want to remain avoiding holes with PNS
+            dist_thresh = av_clipf(2.5f*NOISE_LOW_LIMIT/freq, 0.5f, 2.5f) * dist_bias;
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             *
+             * At this stage, point 2 is relaxed for zeroed bands near the noise threshold (hole avoidance is more important)
              */
             if (((sce->zeroes[w*16+g] || !sce->band_alt[w*16+g]) && sfb_energy < threshold*sqrtf(1.5f/freq_boost)) || spread < spread_threshold ||
-                (sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost)) {
+                (!sce->zeroes[w*16+g] && sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost) ||
+                min_energy < pns_transient_energy_r * max_energy ) {
                 sce->pns_ener[w*16+g] = sfb_energy;
                 continue;
             }
 
-            pns_tgt_energy = sfb_energy*spread*spread/sce->ics.group_len[w];
+            pns_tgt_energy = sfb_energy*FFMIN(1.0f, spread*spread);
             noise_sfi = av_clip(roundf(log2f(pns_tgt_energy)*2), -100, 155); /* Quantize */
             noise_amp = -ff_aac_pow2sf_tab[noise_sfi + POW_SF2_ZERO];    /* Dequantize */
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
@@ -648,13 +683,18 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
                                             sce->ics.swb_sizes[g],
                                             sce->sf_idx[(w+w2)*16+g],
                                             sce->band_alt[(w+w2)*16+g],
-                                            lambda/band->threshold, INFINITY, NULL, 0);
-                /* Estimate rd on average as 9 bits for CB and sf + spread energy * lambda/thr */
-                dist2 += 9+band->energy/(band->spread*band->spread)*lambda/band->threshold;
+                                            lambda/band->threshold, INFINITY, NULL, NULL, 0);
+                /* Estimate rd on average as 5 bits for SF, 4 for the CB, plus spread energy * lambda/thr */
+                dist2 += band->energy/(band->spread*band->spread)*lambda*dist_thresh/band->threshold;
+            }
+            if (g && sce->sf_idx[(w+w2)*16+g-1] == NOISE_BT) {
+                dist2 += 5;
+            } else {
+                dist2 += 9;
             }
             energy_ratio = pns_tgt_energy/pns_energy; /* Compensates for quantization error */
             sce->pns_ener[w*16+g] = energy_ratio*pns_tgt_energy;
-            if (energy_ratio > 0.85f && energy_ratio < 1.25f && (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || dist2*dist_thresh < dist1)) {
+            if (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || (energy_ratio > 0.85f && energy_ratio < 1.25f && dist2 < dist1)) {
                 sce->band_type[w*16+g] = NOISE_BT;
                 sce->zeroes[w*16+g] = 0;
             }
@@ -662,62 +702,203 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
     }
 }
 
+static void mark_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce)
+{
+    FFPsyBand *band;
+    int w, g, w2;
+    int wlen = 1024 / sce->ics.num_windows;
+    int bandwidth, cutoff;
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
+    } else {
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+    }
+
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
+            const int start = sce->ics.swb_offset[g];
+            const float freq = start*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || start >= cutoff) {
+                sce->can_pns[w*16+g] = 0;
+                continue;
+            }
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     = FFMIN(spread, band->spread);
+                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
+                } else {
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
+                }
+            }
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             */
+            sce->pns_ener[w*16+g] = sfb_energy;
+            if (sfb_energy < threshold*sqrtf(1.5f/freq_boost) || spread < spread_threshold || min_energy < pns_transient_energy_r * max_energy) {
+                sce->can_pns[w*16+g] = 0;
+            } else {
+                sce->can_pns[w*16+g] = 1;
+            }
+        }
+    }
+}
+
 static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
 {
-    int start = 0, i, w, w2, g;
+    int start = 0, i, w, w2, g, sid_sf_boost;
     float M[128], S[128];
     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
     const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
     SingleChannelElement *sce0 = &cpe->ch[0];
     SingleChannelElement *sce1 = &cpe->ch[1];
     if (!cpe->common_window)
         return;
     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        int min_sf_idx_mid = SCALE_MAX_POS;
+        int min_sf_idx_side = SCALE_MAX_POS;
+        for (g = 0; g < sce0->ics.num_swb; g++) {
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                min_sf_idx_mid = FFMIN(min_sf_idx_mid, sce0->sf_idx[w*16+g]);
+            if (!sce1->zeroes[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                min_sf_idx_side = FFMIN(min_sf_idx_side, sce1->sf_idx[w*16+g]);
+        }
+
         start = 0;
         for (g = 0;  g < sce0->ics.num_swb; g++) {
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            cpe->ms_mask[w*16+g] = 0;
             if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
-                float dist1 = 0.0f, dist2 = 0.0f;
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
-                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
-                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
-                    float minthr = FFMIN(band0->threshold, band1->threshold);
-                    float maxthr = FFMAX(band0->threshold, band1->threshold);
                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
                         M[i] = (sce0->coeffs[start+(w+w2)*128+i]
                               + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
                         S[i] =  M[i]
                               - sce1->coeffs[start+(w+w2)*128+i];
                     }
-                    abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
-                    dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
-                                                L34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL, 0);
-                    dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
-                                                R34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL, 0);
-                    dist2 += quantize_band_cost(s, M,
-                                                M34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL, 0);
-                    dist2 += quantize_band_cost(s, S,
-                                                S34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL, 0);
+                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
+
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, min_sf_idx_mid, min_sf_idx_mid + SCALE_MAX_DIFF);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, min_sf_idx_side, min_sf_idx_side + SCALE_MAX_DIFF);
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    if ((mididx > minidx) || (sididx > minidx)) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+
+                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[(w+w2)*16+g],
+                                                    sce0->band_type[(w+w2)*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[(w+w2)*16+g],
+                                                    sce1->band_type[(w+w2)*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[(w+w2)*16+g],
+                                                    sce0->band_type[(w+w2)*16+g],
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[(w+w2)*16+g],
+                                                    sce1->band_type[(w+w2)*16+g],
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= B0;
+                        dist2 -= B1;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
+                    if (cpe->ms_mask[w*16+g]) {
+                        /* Setting the M/S mask is useful with I/S, but only the flag */
+                        if (!cpe->is_mask[w*16+g]) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
+                    }
                 }
-                cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
             start += sce0->ics.swb_sizes[g];
         }
@@ -736,6 +917,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
         ff_aac_apply_tns,
         set_special_band_scalefactors,
         search_for_pns,
+        mark_pns,
         ff_aac_search_for_tns,
         search_for_ms,
         ff_aac_search_for_is,
@@ -752,6 +934,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
         ff_aac_apply_tns,
         set_special_band_scalefactors,
         search_for_pns,
+        mark_pns,
         ff_aac_search_for_tns,
         search_for_ms,
         ff_aac_search_for_is,
@@ -768,6 +951,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
         ff_aac_apply_tns,
         set_special_band_scalefactors,
         search_for_pns,
+        mark_pns,
         ff_aac_search_for_tns,
         search_for_ms,
         ff_aac_search_for_is,
@@ -784,6 +968,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
         ff_aac_apply_tns,
         set_special_band_scalefactors,
         search_for_pns,
+        mark_pns,
         ff_aac_search_for_tns,
         search_for_ms,
         ff_aac_search_for_is,
diff --git a/libavcodec/aaccoder_trellis.h b/libavcodec/aaccoder_trellis.h
index 7d685ebe8c..6187692479 100644
--- a/libavcodec/aaccoder_trellis.h
+++ b/libavcodec/aaccoder_trellis.h
@@ -129,7 +129,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
                                                &s->scoefs[start + w*128], size,
                                                sce->sf_idx[win*16+swb],
                                                aac_cb_out_map[cb],
-                                               0, INFINITY, NULL, 0);
+                                               0, INFINITY, NULL, NULL, 0);
                 }
                 cost_stay_here = path[swb][cb].cost + bits;
                 cost_get_here  = minbits            + bits + run_bits + 4;
diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h
index 5ac09dc9cc..21a4aed6cb 100644
--- a/libavcodec/aaccoder_twoloop.h
+++ b/libavcodec/aaccoder_twoloop.h
@@ -22,7 +22,7 @@
 /**
  * @file
  * AAC encoder twoloop coder
- * @author Konstantin Shishkov
+ * @author Konstantin Shishkov, Claudio Freire
  */
 
 /**
@@ -34,6 +34,7 @@
  *  - abs_pow34_v
  *  - find_max_val
  *  - find_min_book
+ *  - find_form_factor
  */
 
 #ifndef AVCODEC_AACCODER_TWOLOOP_H
@@ -41,6 +42,7 @@
 
 #include <float.h>
 #include "libavutil/mathematics.h"
+#include "mathops.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
@@ -49,6 +51,20 @@
 #include "aacenctab.h"
 #include "aac_tablegen_decl.h"
 
+/** Frequency in Hz for lower limit of noise substitution **/
+#define NOISE_LOW_LIMIT 4000
+
+#define sclip(x) av_clip(x,60,218)
+
+
+static av_always_inline int ff_pns_bits(const SingleChannelElement *sce, int w, int g)
+{
+    if (!g || !sce->zeroes[w*16+g-1] || !sce->can_pns[w*16+g-1]) {
+        return 9;
+    } else {
+        return 5;
+    }
+}
 
 /**
  * two-loop quantizers search taken from ISO 13818-7 Appendix C
@@ -58,51 +74,219 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                                           SingleChannelElement *sce,
                                           const float lambda)
 {
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128] = { 0 };
-    float maxvals[128];
-    int fflag, minscaler;
+    int start = 0, i, w, w2, g, recomprd;
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+    int refbits = destbits;
+    int toomanybits, toofewbits;
+    char nzs[128];
+    int maxsf[128];
+    float dists[128] = { 0 }, qenergies[128] = { 0 }, uplims[128], euplims[128], energies[128];
+    float maxvals[128], spread_thr_r[128];
+    float min_spread_thr_r, max_spread_thr_r;
+
+    /**
+     * rdlambda controls the maximum tolerated distortion. Twoloop
+     * will keep iterating until it fails to lower it or it reaches
+     * ulimit * rdlambda. Keeping it low increases quality on difficult
+     * signals, but lower it too much, and bits will be taken from weak
+     * signals, creating "holes". A balance is necesary.
+     * rdmax and rdmin specify the relative deviation from rdlambda
+     * allowed for tonality compensation
+     */
+    float rdlambda = av_clipf(2.0f * 120.f / lambda, 0.0625f, 16.0f);
+    const float nzslope = 1.5f;
+    float rdmin = 0.03125f;
+    float rdmax = 1.0f;
+
+    /**
+     * sfoffs controls an offset of optmium allocation that will be
+     * applied based on lambda. Keep it real and modest, the loop
+     * will take care of the rest, this just accelerates convergence
+     */
+    float sfoffs = av_clipf(log2f(120.0f / lambda) * 4.0f, -5, 10);
+
+    int fflag, minscaler, maxscaler, nminscaler, minrdsf;
     int its  = 0;
+    int maxits = 30;
     int allz = 0;
-    float minthr = INFINITY;
+    int tbits;
+    int cutoff = 1024;
+    int pns_start_pos;
+
+    /**
+     * zeroscale controls a multiplier of the threshold, if band energy
+     * is below this, a zero is forced. Keep it lower than 1, unless
+     * low lambda is used, because energy < threshold doesn't mean there's
+     * no audible signal outright, it's just energy. Also make it rise
+     * slower than rdlambda, as rdscale has due compensation with
+     * noisy band depriorization below, whereas zeroing logic is rather dumb
+     */
+    float zeroscale;
+    if (lambda > 120.f) {
+        zeroscale = av_clipf(powf(120.f / lambda, 0.25f), 0.0625f, 1.0f);
+    } else {
+        zeroscale = 1.f;
+    }
+
+    if (s->psy.bitres.alloc >= 0) {
+        /**
+         * Psy granted us extra bits to use, from the reservoire
+         * adjust for lambda except what psy already did
+         */
+        destbits = s->psy.bitres.alloc
+            * (lambda / (avctx->global_quality ? avctx->global_quality : 120));
+    }
+
+    if (avctx->flags & CODEC_FLAG_QSCALE) {
+        /**
+         * Constant Q-scale doesn't compensate MS coding on its own
+         * No need to be overly precise, this only controls RD
+         * adjustment CB limits when going overboard
+         */
+        if (s->options.stereo_mode && s->cur_type == TYPE_CPE)
+            destbits *= 2;
+
+        /**
+         * When using a constant Q-scale, don't adjust bits, just use RD
+         * Don't let it go overboard, though... 8x psy target is enough
+         */
+        toomanybits = 5800;
+        toofewbits = destbits / 16;
+
+        /** Don't offset scalers, just RD */
+        sfoffs = sce->ics.num_windows - 1;
+        rdlambda = sqrtf(rdlambda);
+
+        /** search further */
+        maxits *= 2;
+    } else {
+        /** When using ABR, be strict */
+        toomanybits = destbits + destbits/16;
+        toofewbits = destbits - destbits/4;
+
+        sfoffs = 0;
+        rdlambda = sqrtf(rdlambda);
+    }
+
+    /** and zero out above cutoff frequency */
+    {
+        int wlen = 1024 / sce->ics.num_windows;
+        int bandwidth;
+
+        /**
+         * Scale, psy gives us constant quality, this LP only scales
+         * bitrate by lambda, so we save bits on subjectively unimportant HF
+         * rather than increase quantization noise. Adjust nominal bitrate
+         * to effective bitrate according to encoding parameters,
+         * AAC_CUTOFF_FROM_BITRATE is calibrated for effective bitrate.
+         */
+        float rate_bandwidth_multiplier = 1.5f;
+        int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+            ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+            : (avctx->bit_rate / avctx->channels);
+
+        /** Compensate for extensions that increase efficiency */
+        if (s->options.pns || s->options.intensity_stereo)
+            frame_bit_rate *= 1.15f;
 
-    // for values above this the decoder might end up in an endless loop
-    // due to always having more bits than what can be encoded.
+        if (avctx->cutoff > 0) {
+            bandwidth = avctx->cutoff;
+        } else {
+            bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+        }
+
+        cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+        pns_start_pos = NOISE_LOW_LIMIT * 2 * wlen / avctx->sample_rate;
+    }
+
+    /**
+     * for values above this the decoder might end up in an endless loop
+     * due to always having more bits than what can be encoded.
+     */
     destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
-    //determine zero bands and upper limits
+    toomanybits = FFMIN(toomanybits, 5800);
+    toofewbits = FFMIN(toofewbits, 5800);
+    /**
+     * XXX: some heuristic to determine initial quantizers will reduce search time
+     * determine zero bands and upper distortion limits
+     */
+    min_spread_thr_r = -1;
+    max_spread_thr_r = -1;
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
+        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
             int nz = 0;
-            float uplim = 0.0f, energy = 0.0f;
+            float uplim = 0.0f, energy = 0.0f, spread = 0.0f;
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim  += band->threshold;
-                energy += band->energy;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
+                if (start >= cutoff || band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f) {
                     sce->zeroes[(w+w2)*16+g] = 1;
                     continue;
                 }
                 nz = 1;
             }
-            uplims[w*16+g] = uplim *512;
+            if (!nz) {
+                uplim = 0.0f;
+            } else {
+                nz = 0;
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                    FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                    if (band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f)
+                        continue;
+                    uplim += band->threshold;
+                    energy += band->energy;
+                    spread += band->spread;
+                    nz++;
+                }
+            }
+            uplims[w*16+g] = uplim;
+            energies[w*16+g] = energy;
+            nzs[w*16+g] = nz;
             sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
             allz |= nz;
+            if (nz) {
+                spread_thr_r[w*16+g] = energy * nz / (uplim * spread);
+                if (min_spread_thr_r < 0) {
+                    min_spread_thr_r = max_spread_thr_r = spread_thr_r[w*16+g];
+                } else {
+                    min_spread_thr_r = FFMIN(min_spread_thr_r, spread_thr_r[w*16+g]);
+                    max_spread_thr_r = FFMAX(max_spread_thr_r, spread_thr_r[w*16+g]);
+                }
+            }
         }
     }
+
+    /** Compute initial scalers */
+    minscaler = 65535;
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         for (g = 0;  g < sce->ics.num_swb; g++) {
             if (sce->zeroes[w*16+g]) {
                 sce->sf_idx[w*16+g] = SCALE_ONE_POS;
                 continue;
             }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
+            /**
+             * log2f-to-distortion ratio is, technically, 2 (1.5db = 4, but it's power vs level so it's 2).
+             * But, as offsets are applied, low-frequency signals are too sensitive to the induced distortion,
+             * so we make scaling more conservative by choosing a lower log2f-to-distortion ratio, and thus
+             * more robust.
+             */
+            sce->sf_idx[w*16+g] = av_clip(
+                SCALE_ONE_POS
+                    + 1.75*log2f(FFMAX(0.00125f,uplims[w*16+g]) / sce->ics.swb_sizes[g])
+                    + sfoffs,
+                60, SCALE_MAX_POS);
+            minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
         }
     }
 
+    /** Clip */
+    minscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
+        for (g = 0;  g < sce->ics.num_swb; g++)
+            if (!sce->zeroes[w*16+g])
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF - 1);
+
     if (!allz)
         return;
     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
@@ -116,15 +300,66 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
         }
     }
 
+    /**
+     * Scale uplims to match rate distortion to quality
+     * bu applying noisy band depriorization and tonal band priorization.
+     * Maxval-energy ratio gives us an idea of how noisy/tonal the band is.
+     * If maxval^2 ~ energy, then that band is mostly noise, and we can relax
+     * rate distortion requirements.
+     */
+    memcpy(euplims, uplims, sizeof(euplims));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** psy already priorizes transients to some extent */
+        float de_psy_factor = (sce->ics.num_windows > 1) ? 8.0f / sce->ics.group_len[w] : 1.0f;
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (nzs[g] > 0) {
+                float cleanup_factor = ff_sqrf(av_clipf(start / (cutoff * 0.75f), 1.0f, 2.0f));
+                float energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    nzslope * cleanup_factor);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to priorize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                uplims[w*16+g] *= av_clipf(rdlambda * energy2uplim, rdmin, rdmax)
+                                  * sce->ics.group_len[w];
+
+                energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    2.0f);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to priorize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                euplims[w*16+g] *= av_clipf(rdlambda * energy2uplim * sce->ics.group_len[w],
+                    0.5f, 1.0f);
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    for (i = 0; i < sizeof(maxsf) / sizeof(maxsf[0]); ++i)
+        maxsf[i] = SCALE_MAX_POS;
+
     //perform two-loop search
     //outer loop - improve quality
     do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
         //inner loop - quantize spectrum to fit into given number of bits
-        qstep = its ? 1 : 32;
+        int overdist;
+        int qstep = its ? 1 : 32;
         do {
             int prev = -1;
+            int changed = 0;
+            recomprd = 0;
             tbits = 0;
             for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                 start = w*128;
@@ -134,15 +369,20 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                     int bits = 0;
                     int cb;
                     float dist = 0.0f;
+                    float qenergy = 0.0f;
 
                     if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
                         start += sce->ics.swb_sizes[g];
+                        if (sce->can_pns[w*16+g]) {
+                            /** PNS isn't free */
+                            tbits += ff_pns_bits(sce, w, g);
+                        }
                         continue;
                     }
-                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
                     cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
                     for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                         int b;
+                        float sqenergy;
                         dist += quantize_band_cost(s, coefs + w2*128,
                                                    scaled + w2*128,
                                                    sce->ics.swb_sizes[g],
@@ -150,54 +390,309 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                                                    cb,
                                                    1.0f,
                                                    INFINITY,
-                                                   &b,
+                                                   &b, &sqenergy,
                                                    0);
                         bits += b;
+                        qenergy += sqenergy;
                     }
                     dists[w*16+g] = dist - bits;
+                    qenergies[w*16+g] = qenergy;
                     if (prev != -1) {
-                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
+                        int sfdiff = sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO;
+                        av_assert1(sfdiff >= 0 && sfdiff <= 2*SCALE_MAX_DIFF);
+                        bits += ff_aac_scalefactor_bits[sfdiff];
                     }
                     tbits += bits;
                     start += sce->ics.swb_sizes[g];
                     prev = sce->sf_idx[w*16+g];
                 }
             }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
+            if (tbits > toomanybits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] < (SCALE_MAX_POS - SCALE_DIV_512)) {
+                        int maxsf_i = (tbits > 5800) ? SCALE_MAX_POS : maxsf[i];
+                        int new_sf = FFMIN(maxsf_i, sce->sf_idx[i] + qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
+            } else if (tbits < toofewbits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] > SCALE_ONE_POS) {
+                        int new_sf = FFMAX(SCALE_ONE_POS, sce->sf_idx[i] - qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
             }
             qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
+            if (!qstep && tbits > toomanybits && sce->sf_idx[0] < 217 && changed)
                 qstep = 1;
         } while (qstep);
 
-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
+        overdist = 1;
+        for (i = 0; i < 2 && (overdist || recomprd); ++i) {
+            if (recomprd) {
+                /** Must recompute distortion */
+                int prev = -1;
+                tbits = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    start = w*128;
+                    for (g = 0;  g < sce->ics.num_swb; g++) {
+                        const float *coefs = sce->coeffs + start;
+                        const float *scaled = s->scoefs + start;
+                        int bits = 0;
+                        int cb;
+                        float dist = 0.0f;
+                        float qenergy = 0.0f;
 
+                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                            start += sce->ics.swb_sizes[g];
+                            if (sce->can_pns[w*16+g]) {
+                                /** PNS isn't free */
+                                tbits += ff_pns_bits(sce, w, g);
+                            }
+                            continue;
+                        }
+                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                            int b;
+                            float sqenergy;
+                            dist += quantize_band_cost(s, coefs + w2*128,
+                                                    scaled + w2*128,
+                                                    sce->ics.swb_sizes[g],
+                                                    sce->sf_idx[w*16+g],
+                                                    cb,
+                                                    1.0f,
+                                                    INFINITY,
+                                                    &b, &sqenergy,
+                                                    0);
+                            bits += b;
+                            qenergy += sqenergy;
+                        }
+                        dists[w*16+g] = dist - bits;
+                        qenergies[w*16+g] = qenergy;
+                        if (prev != -1) {
+                            int sfdiff = sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO;
+                            av_assert1(sfdiff >= 0 && sfdiff <= 2*SCALE_MAX_DIFF);
+                            bits += ff_aac_scalefactor_bits[sfdiff];
+                        }
+                        tbits += bits;
+                        start += sce->ics.swb_sizes[g];
+                        prev = sce->sf_idx[w*16+g];
+                    }
+                }
+            }
+            if (!i && s->options.pns && its > maxits/2) {
+                float maxoverdist = 0.0f;
+                overdist = recomprd = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    float ovrfactor = 2.f+(maxits-its)*16.f/maxits;
+                    for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                        if (!sce->zeroes[w*16+g] && dists[w*16+g] > uplims[w*16+g]*ovrfactor) {
+                            float ovrdist = dists[w*16+g] / FFMAX(uplims[w*16+g],euplims[w*16+g]);
+                            maxoverdist = FFMAX(maxoverdist, ovrdist);
+                            overdist++;
+                        }
+                    }
+                }
+                if (overdist) {
+                    /* We have overdistorted bands, trade for zeroes (that can be noise)
+                     * Zero the bands in the lowest 1.25% spread-energy-threshold ranking
+                     */
+                    float minspread = max_spread_thr_r;
+                    float maxspread = min_spread_thr_r;
+                    float zspread;
+                    int zeroable = 0;
+                    int zeroed = 0;
+                    int maxzeroed;
+                    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                            if (start >= pns_start_pos && !sce->zeroes[w*16+g] && sce->can_pns[w*16+g]) {
+                                minspread = FFMIN(minspread, spread_thr_r[w*16+g]);
+                                maxspread = FFMAX(maxspread, spread_thr_r[w*16+g]);
+                                zeroable++;
+                            }
+                        }
+                    }
+                    zspread = (maxspread-minspread) * 0.0125f + minspread;
+                    zspread = FFMIN(maxoverdist, zspread);
+                    maxzeroed = zeroable * its / (2 * maxits);
+                    for (g = sce->ics.num_swb-1; g > 0 && zeroed < maxzeroed; g--) {
+                        if (sce->ics.swb_offset[g] < pns_start_pos)
+                            continue;
+                        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                            if (!sce->zeroes[w*16+g] && sce->can_pns[w*16+g] && spread_thr_r[w*16+g] <= zspread) {
+                                sce->zeroes[w*16+g] = 1;
+                                sce->band_type[w*16+g] = 0;
+                                zeroed++;
+                            }
+                        }
+                    }
+                    if (zeroed)
+                        recomprd = 1;
+                } else {
+                    overdist = 0;
+                }
+            }
+        }
+
+        minscaler = SCALE_MAX_POS;
+        maxscaler = 0;
         for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0;  g < sce->ics.num_swb; g++) {
+                if (!sce->zeroes[w*16+g]) {
+                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+                    maxscaler = FFMAX(maxscaler, sce->sf_idx[w*16+g]);
+                }
+            }
+        }
+
+        fflag = 0;
+        minscaler = nminscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+        minrdsf = FFMAX3(60, minscaler - 1, maxscaler - SCALE_MAX_DIFF - 1);
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            /** Start with big steps, end up fine-tunning */
+            int depth = (its > maxits/2) ? ((its > maxits*2/3) ? 1 : 3) : 10;
+            int edepth = depth+2;
+            float uplmax = its / (maxits*0.25f) + 1.0f;
+            uplmax *= (tbits > destbits) ? FFMIN(2.0f, tbits / (float)FFMAX(1,destbits)) : 1.0f;
+            start = w * 128;
             for (g = 0; g < sce->ics.num_swb; g++) {
                 int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else //Try to make sure there is some energy in every band
-                        sce->sf_idx[w*16+g]-=2;
+                int minrdsfboost = (sce->ics.num_windows > 1) ? av_clip(g-4, -2, 0) : av_clip(g-16, -4, 0);
+                if (!sce->zeroes[w*16+g]) {
+                    const float *coefs = sce->coeffs + start;
+                    const float *scaled = s->scoefs + start;
+                    int cmb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    if ((!cmb || dists[w*16+g] > uplims[w*16+g]) && sce->sf_idx[w*16+g] > minrdsf) {
+                        /* Try to make sure there is some energy in every nonzero band
+                         * NOTE: This algorithm must be forcibly imbalanced, pushing harder
+                         *  on holes or more distorted bands at first, otherwise there's
+                         *  no net gain (since the next iteration will offset all bands
+                         *  on the opposite direction to compensate for extra bits)
+                         */
+                        for (i = 0; i < edepth; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            int mb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1);
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                            dist = qenergy = 0.f;
+                            bits = 0;
+                            if (!cb) {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g]-1, maxsf[w*16+g]);
+                            } else if (i >= depth && dists[w*16+g] < euplims[w*16+g]) {
+                                break;
+                            }
+                            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                int b;
+                                float sqenergy;
+                                dist += quantize_band_cost(s, coefs + w2*128,
+                                                        scaled + w2*128,
+                                                        sce->ics.swb_sizes[g],
+                                                        sce->sf_idx[w*16+g]-1,
+                                                        cb,
+                                                        1.0f,
+                                                        INFINITY,
+                                                        &b, &sqenergy,
+                                                        0);
+                                bits += b;
+                                qenergy += sqenergy;
+                            }
+                            sce->sf_idx[w*16+g]--;
+                            dists[w*16+g] = dist - bits;
+                            qenergies[w*16+g] = qenergy;
+                            if (mb && (sce->sf_idx[w*16+g] < (minrdsf+minrdsfboost) || (
+                                    (dists[w*16+g] < FFMIN(uplmax*uplims[w*16+g], euplims[w*16+g]))
+                                    && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                                ) )) {
+                                break;
+                            }
+                        }
+                    } else if (tbits > toofewbits && sce->sf_idx[w*16+g] < maxscaler
+                            && (dists[w*16+g] < FFMIN(euplims[w*16+g], uplims[w*16+g]))
+                            && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                        ) {
+                        /** Um... over target. Save bits for more important stuff. */
+                        for (i = 0; i < depth; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]+1);
+                            if (cb > 0) {
+                                dist = qenergy = 0.f;
+                                bits = 0;
+                                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                    int b;
+                                    float sqenergy;
+                                    dist += quantize_band_cost(s, coefs + w2*128,
+                                                            scaled + w2*128,
+                                                            sce->ics.swb_sizes[g],
+                                                            sce->sf_idx[w*16+g]+1,
+                                                            cb,
+                                                            1.0f,
+                                                            INFINITY,
+                                                            &b, &sqenergy,
+                                                            0);
+                                    bits += b;
+                                    qenergy += sqenergy;
+                                }
+                                dist -= bits;
+                                if (dist < FFMIN(euplims[w*16+g], uplims[w*16+g])) {
+                                    sce->sf_idx[w*16+g]++;
+                                    dists[w*16+g] = dist;
+                                    qenergies[w*16+g] = qenergy;
+                                } else {
+                                    break;
+                                }
+                            } else {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g], maxsf[w*16+g]);
+                                break;
+                            }
+                        }
+                    }
                 }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minrdsf, minscaler + SCALE_MAX_DIFF);
+                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], SCALE_MAX_POS - SCALE_DIV_512);
                 if (sce->sf_idx[w*16+g] != prevsc)
                     fflag = 1;
+                nminscaler = FFMIN(nminscaler, sce->sf_idx[w*16+g]);
                 sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                start += sce->ics.swb_sizes[g];
+            }
+        }
+        if (nminscaler < minscaler) {
+            /** Drecreased some scalers below minscaler. Must re-clamp. */
+            minscaler = nminscaler;
+            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                for (g = 0; g < sce->ics.num_swb; g++) {
+                    sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
+                    sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                }
             }
         }
         its++;
-    } while (fflag && its < 10);
+    } while (fflag && its < maxits);
+
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** Make sure proper codebooks are set */
+        for (g = start = 0; g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+            if (!sce->zeroes[w*16+g]) {
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                if (sce->band_type[w*16+g] <= 0) {
+                    sce->zeroes[w*16+g] = 1;
+                    sce->band_type[w*16+g] = 0;
+                }
+            } else {
+                sce->band_type[w*16+g] = 0;
+            }
+        }
+    }
 }
 
 #endif /* AVCODEC_AACCODER_TWOLOOP_H */
diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index 1b95ebd755..3e21bfffa0 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -258,6 +258,8 @@ static void apply_intensity_stereo(ChannelElement *cpe)
                     start += ics->swb_sizes[g];
                     continue;
                 }
+                if (cpe->ms_mask[w*16 + g])
+                    p *= -1;
                 for (i = 0; i < ics->swb_sizes[g]; i++) {
                     float sum = (cpe->ch[0].coeffs[start+i] + p*cpe->ch[1].coeffs[start+i])*scale;
                     cpe->ch[0].coeffs[start+i] = sum;
@@ -279,7 +281,7 @@ static void apply_mid_side_stereo(ChannelElement *cpe)
         for (w2 =  0; w2 < ics->group_len[w]; w2++) {
             int start = (w+w2) * 128;
             for (g = 0; g < ics->num_swb; g++) {
-                if (!cpe->ms_mask[w*16 + g]) {
+                if (!cpe->ms_mask[w*16 + g] && !cpe->is_mask[w*16 + g]) {
                     start += ics->swb_sizes[g];
                     continue;
                 }
@@ -490,6 +492,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ChannelElement *cpe;
     SingleChannelElement *sce;
     int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
+    int target_bits, rate_bits, too_many_bits, too_few_bits;
     int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
     int chan_el_counter[4];
     FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
@@ -583,8 +586,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         return ret;
     frame_bits = its = 0;
     do {
-        int target_bits, too_many_bits, too_few_bits;
-
         init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
         if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT))
@@ -618,12 +619,15 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
             if (s->psy.bitres.alloc > 0) {
                 /* Lambda unused here on purpose, we need to take psy's unscaled allocation */
-                target_bits += s->psy.bitres.alloc;
+                target_bits += s->psy.bitres.alloc
+                    * (s->lambda / (avctx->global_quality ? avctx->global_quality : 120));
                 s->psy.bitres.alloc /= chans;
             }
             s->cur_type = tag;
             for (ch = 0; ch < chans; ch++) {
                 s->cur_channel = start_ch + ch;
+                if (s->options.pns && s->coder->mark_pns)
+                    s->coder->mark_pns(s, avctx, &cpe->ch[ch]);
                 s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda);
             }
             if (chans > 1
@@ -680,8 +684,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                     s->coder->search_for_ms(s, cpe);
                 else if (cpe->common_window)
                     memset(cpe->ms_mask, 1, sizeof(cpe->ms_mask));
-                for (w = 0; w < 128; w++)
-                    cpe->ms_mask[w] = cpe->is_mask[w] ? 0 : cpe->ms_mask[w];
                 apply_mid_side_stereo(cpe);
             }
             adjust_frame_information(cpe, chans);
@@ -708,23 +710,25 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         }
 
         /* rate control stuff
-         * target either the nominal bitrate, or what psy's bit reservoir says to target
-         * whichever is greatest
+         * allow between the nominal bitrate, and what psy's bit reservoir says to target
+         * but drift towards the nominal bitrate always
          */
-
         frame_bits = put_bits_count(&s->pb);
-        target_bits = FFMAX(target_bits, avctx->bit_rate * 1024 / avctx->sample_rate);
-        target_bits = FFMIN(target_bits, 6144 * s->channels - 3);
+        rate_bits = avctx->bit_rate * 1024 / avctx->sample_rate;
+        rate_bits = FFMIN(rate_bits, 6144 * s->channels - 3);
+        too_many_bits = FFMAX(target_bits, rate_bits);
+        too_many_bits = FFMIN(too_many_bits, 6144 * s->channels - 3);
+        too_few_bits = FFMIN(FFMAX(rate_bits - rate_bits/4, target_bits), too_many_bits);
 
         /* When using ABR, be strict (but only for increasing) */
-        too_many_bits = target_bits + target_bits/2;
-        too_few_bits = target_bits - target_bits/8;
+        too_few_bits = too_few_bits - too_few_bits/8;
+        too_many_bits = too_many_bits + too_many_bits/2;
 
         if (   its == 0 /* for steady-state Q-scale tracking */
             || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
             || frame_bits >= 6144 * s->channels - 3  )
         {
-            float ratio = ((float)target_bits) / frame_bits;
+            float ratio = ((float)rate_bits) / frame_bits;
 
             if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
                 /*
@@ -742,7 +746,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             s->lambda = FFMIN(s->lambda * ratio, 65536.f);
 
             /* Keep iterating if we must reduce and lambda is in the sky */
-            if (s->lambda < 300.f || ratio > 0.9f) {
+            if ((s->lambda < 300.f || ratio > 0.9f) && (s->lambda > 10.f || ratio < 1.1f)) {
                 break;
             } else {
                 if (is_mode || ms_mode || tns_mode || pred_mode) {
@@ -764,6 +768,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     put_bits(&s->pb, 3, TYPE_END);
     flush_put_bits(&s->pb);
     avctx->frame_bits = put_bits_count(&s->pb);
+    s->lambda_sum += s->lambda;
+    s->lambda_count++;
 
     if (!frame)
         s->last_frame++;
@@ -780,6 +786,8 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
 {
     AACEncContext *s = avctx->priv_data;
 
+    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_sum / s->lambda_count);
+
     ff_mdct_end(&s->mdct1024);
     ff_mdct_end(&s->mdct128);
     ff_psy_end(&s->psy);
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index 7e7609b1a8..99f50edc9c 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -66,6 +66,7 @@ typedef struct AACCoefficientsEncoder {
     void (*apply_tns_filt)(struct AACEncContext *s, SingleChannelElement *sce);
     void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
     void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*mark_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
     void (*search_for_tns)(struct AACEncContext *s, SingleChannelElement *sce);
     void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe);
     void (*search_for_is)(struct AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
@@ -100,6 +101,8 @@ typedef struct AACEncContext {
     int last_frame;
     int random_state;
     float lambda;
+    float lambda_sum;                            ///< sum(lambda), for Qvg reporting
+    int lambda_count;                            ///< count(lambda), for Qvg reporting
     enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
 
     AudioFrameQueue afq;
diff --git a/libavcodec/aacenc_is.c b/libavcodec/aacenc_is.c
index e983b7548f..97be9b3412 100644
--- a/libavcodec/aacenc_is.c
+++ b/libavcodec/aacenc_is.c
@@ -45,6 +45,11 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
     float dist1 = 0.0f, dist2 = 0.0f;
     struct AACISError is_error = {0};
 
+    if (ener01 <= 0 || ener0 <= 0) {
+        is_error.pass = 0;
+        return is_error;
+    }
+
     for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
         FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
         FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
@@ -63,15 +68,15 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
                                     sce0->ics.swb_sizes[g],
                                     sce0->sf_idx[(w+w2)*16+g],
                                     sce0->band_type[(w+w2)*16+g],
-                                    s->lambda / band0->threshold, INFINITY, NULL, 0);
+                                    s->lambda / band0->threshold, INFINITY, NULL, NULL, 0);
         dist1 += quantize_band_cost(s, &R[start + (w+w2)*128], R34,
                                     sce1->ics.swb_sizes[g],
                                     sce1->sf_idx[(w+w2)*16+g],
                                     sce1->band_type[(w+w2)*16+g],
-                                    s->lambda / band1->threshold, INFINITY, NULL, 0);
+                                    s->lambda / band1->threshold, INFINITY, NULL, NULL, 0);
         dist2 += quantize_band_cost(s, IS, I34, sce0->ics.swb_sizes[g],
                                     is_sf_idx, is_band_type,
-                                    s->lambda / minthr, INFINITY, NULL, 0);
+                                    s->lambda / minthr, INFINITY, NULL, NULL, 0);
         for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
             dist_spec_err += (L34[i] - I34[i])*(L34[i] - I34[i]);
             dist_spec_err += (R34[i] - I34[i]*e01_34)*(R34[i] - I34[i]*e01_34);
@@ -85,6 +90,7 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
     is_error.error = fabsf(dist1 - dist2);
     is_error.dist1 = dist1;
     is_error.dist2 = dist2;
+    is_error.ener01 = ener01;
 
     return is_error;
 }
@@ -105,7 +111,7 @@ void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElemen
             if (start*freq_mult > INT_STEREO_LOW_LIMIT*(s->lambda/170.0f) &&
                 cpe->ch[0].band_type[w*16+g] != NOISE_BT && !cpe->ch[0].zeroes[w*16+g] &&
                 cpe->ch[1].band_type[w*16+g] != NOISE_BT && !cpe->ch[1].zeroes[w*16+g]) {
-                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
+                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f, ener01p = 0.0f;
                 struct AACISError ph_err1, ph_err2, *erf;
                 if (sce0->band_type[w*16+g] == NOISE_BT ||
                     sce1->band_type[w*16+g] == NOISE_BT) {
@@ -114,23 +120,25 @@ void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElemen
                 }
                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
-                        float coef0 = fabsf(sce0->pcoeffs[start+(w+w2)*128+i]);
-                        float coef1 = fabsf(sce1->pcoeffs[start+(w+w2)*128+i]);
+                        float coef0 = fabsf(sce0->coeffs[start+(w+w2)*128+i]);
+                        float coef1 = fabsf(sce1->coeffs[start+(w+w2)*128+i]);
                         ener0  += coef0*coef0;
                         ener1  += coef1*coef1;
                         ener01 += (coef0 + coef1)*(coef0 + coef1);
+                        ener01p += (coef0 - coef1)*(coef0 - coef1);
                     }
                 }
                 ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
-                                                 ener0, ener1, ener01, 0, -1);
+                                                 ener0, ener1, ener01p, 0, -1);
                 ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
                                                  ener0, ener1, ener01, 0, +1);
-                erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2;
+                erf = (ph_err1.pass && ph_err1.error < ph_err2.error) ? &ph_err1 : &ph_err2;
                 if (erf->pass) {
                     cpe->is_mask[w*16+g] = 1;
-                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0/ener01);
+                    cpe->ms_mask[w*16+g] = 0;
+                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0 / erf->ener01);
                     cpe->ch[1].is_ener[w*16+g] = ener0/ener1;
-                    cpe->ch[1].band_type[w*16+g] = erf->phase ? INTENSITY_BT : INTENSITY_BT2;
+                    cpe->ch[1].band_type[w*16+g] = (erf->phase > 0) ? INTENSITY_BT : INTENSITY_BT2;
                     count++;
                 }
             }
diff --git a/libavcodec/aacenc_is.h b/libavcodec/aacenc_is.h
index 31bbacac58..269fd1a9c9 100644
--- a/libavcodec/aacenc_is.h
+++ b/libavcodec/aacenc_is.h
@@ -39,6 +39,7 @@ struct AACISError {
     float error; /* fabs(dist1 - dist2)  */
     float dist1; /* From original coeffs */
     float dist2; /* From IS'd coeffs     */
+    float ener01;
 };
 
 struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
diff --git a/libavcodec/aacenc_pred.c b/libavcodec/aacenc_pred.c
index c0e5e6e3b6..7d141930e8 100644
--- a/libavcodec/aacenc_pred.c
+++ b/libavcodec/aacenc_pred.c
@@ -271,7 +271,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
         abs_pow34_v(O34, &sce->coeffs[start_coef], num_coeffs);
         dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL,
                                               O34, num_coeffs, sce->sf_idx[sfb],
-                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, 0);
+                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, NULL, 0);
         cost_coeffs += cost1;
 
         /* Encoded coefficients - needed for #bits, band type and quant. error */
@@ -284,7 +284,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
             cb_p = cb_n;
         quantize_and_encode_band_cost(s, NULL, SENT, QERR, S34, num_coeffs,
                                       sce->sf_idx[sfb], cb_p, s->lambda / band->threshold, INFINITY,
-                                      &cost2, 0);
+                                      &cost2, NULL, 0);
 
         /* Reconstructed coefficients - needed for distortion measurements */
         for (i = 0; i < num_coeffs; i++)
@@ -296,7 +296,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
             cb_p = cb_n;
         dist2 = quantize_and_encode_band_cost(s, NULL, &sce->prcoeffs[start_coef], NULL,
                                               P34, num_coeffs, sce->sf_idx[sfb],
-                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, 0);
+                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, NULL, 0);
         for (i = 0; i < num_coeffs; i++)
             dist_spec_err += (O34[i] - P34[i])*(O34[i] - P34[i]);
         dist_spec_err *= s->lambda / band->threshold;
diff --git a/libavcodec/aacenc_quantization.h b/libavcodec/aacenc_quantization.h
index 6776dc37f7..1c3df38e9f 100644
--- a/libavcodec/aacenc_quantization.h
+++ b/libavcodec/aacenc_quantization.h
@@ -43,7 +43,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
                                 PutBitContext *pb, const float *in, float *out,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits, int BT_ZERO, int BT_UNSIGNED,
+                                int *bits, float *energy, int BT_ZERO, int BT_UNSIGNED,
                                 int BT_PAIR, int BT_ESC, int BT_NOISE, int BT_STEREO,
                                 const float ROUNDING)
 {
@@ -54,6 +54,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
     const float CLIPPED_ESCAPE = 165140.0f*IQ;
     int i, j;
     float cost = 0;
+    float qenergy = 0;
     const int dim = BT_PAIR ? 2 : 4;
     int resbits = 0;
     int off;
@@ -63,6 +64,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(
             cost += in[i]*in[i];
         if (bits)
             *bits = 0;
+        if (energy)
+            *energy = qenergy;
         if (out) {
             for (i = 0; i < size; i += dim)
                 for (j = 0; j < dim; j++)
@@ -113,11 +116,13 @@ static av_always_inline float quantize_and_encode_band_cost_template(
                     out[i+j] = in[i+j] >= 0 ? quantized : -quantized;
                 if (vec[j] != 0.0f)
                     curbits++;
+                qenergy += quantized*quantized;
                 rd += di*di;
             }
         } else {
             for (j = 0; j < dim; j++) {
                 quantized = vec[j]*IQ;
+                qenergy += quantized*quantized;
                 if (out)
                     out[i+j] = quantized;
                 rd += (in[i+j] - quantized)*(in[i+j] - quantized);
@@ -149,6 +154,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(
 
     if (bits)
         *bits = resbits;
+    if (energy)
+        *energy = qenergy;
     return cost;
 }
 
@@ -156,7 +163,7 @@ static inline float quantize_and_encode_band_cost_NONE(struct AACEncContext *s,
                                                 const float *in, float *quant, const float *scaled,
                                                 int size, int scale_idx, int cb,
                                                 const float lambda, const float uplim,
-                                                int *bits) {
+                                                int *bits, float *energy) {
     av_assert0(0);
     return 0.0f;
 }
@@ -167,10 +174,10 @@ static float quantize_and_encode_band_cost_ ## NAME(
                                 PutBitContext *pb, const float *in, float *quant,            \
                                 const float *scaled, int size, int scale_idx,                \
                                 int cb, const float lambda, const float uplim,               \
-                                int *bits) {                                                 \
+                                int *bits, float *energy) {                                  \
     return quantize_and_encode_band_cost_template(                                           \
                                 s, pb, in, quant, scaled, size, scale_idx,                   \
-                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits,                   \
+                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits, energy,           \
                                 BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO,  \
                                 ROUNDING);                                                   \
 }
@@ -190,7 +197,7 @@ static float (*const quantize_and_encode_band_cost_arr[])(
                                 PutBitContext *pb, const float *in, float *quant,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits) = {
+                                int *bits, float *energy) = {
     quantize_and_encode_band_cost_ZERO,
     quantize_and_encode_band_cost_SQUAD,
     quantize_and_encode_band_cost_SQUAD,
@@ -214,7 +221,7 @@ static float (*const quantize_and_encode_band_cost_rtz_arr[])(
                                 PutBitContext *pb, const float *in, float *quant,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits) = {
+                                int *bits, float *energy) = {
     quantize_and_encode_band_cost_ZERO,
     quantize_and_encode_band_cost_SQUAD,
     quantize_and_encode_band_cost_SQUAD,
@@ -235,32 +242,32 @@ static float (*const quantize_and_encode_band_cost_rtz_arr[])(
 
 #define quantize_and_encode_band_cost(                                  \
                                 s, pb, in, quant, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, rtz)               \
+                                lambda, uplim, bits, energy, rtz)               \
     ((rtz) ? quantize_and_encode_band_cost_rtz_arr : quantize_and_encode_band_cost_arr)[cb]( \
                                 s, pb, in, quant, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
+                                lambda, uplim, bits, energy)
 
 static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits, int rtz)
+                                int *bits, float *energy, int rtz)
 {
     return quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
-                                         cb, lambda, uplim, bits, rtz);
+                                         cb, lambda, uplim, bits, energy, rtz);
 }
 
 static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits, int rtz)
+                                int *bits, float *energy, int rtz)
 {
-    int _bits;
+    int auxbits;
     quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
-                                         cb, 0.0f, uplim, &_bits, rtz);
+                                         cb, 0.0f, uplim, &auxbits, energy, rtz);
     if (bits) {
-        *bits = _bits;
+        *bits = auxbits;
     }
-    return _bits;
+    return auxbits;
 }
 
 static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
@@ -268,7 +275,7 @@ static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitConte
                                             int cb, const float lambda, int rtz)
 {
     quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL, rtz);
+                                  INFINITY, NULL, NULL, rtz);
 }
 
 #endif /* AVCODEC_AACENC_QUANTIZATION_H */
diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h
index dbc9554379..b2ce22186b 100644
--- a/libavcodec/aacenc_utils.h
+++ b/libavcodec/aacenc_utils.h
@@ -96,6 +96,54 @@ static inline int find_min_book(float maxval, int sf)
     return cb;
 }
 
+static float find_form_factor(int group_len, int swb_size, float thresh, const float *scaled, float nzslope) {
+    const float iswb_size = 1.0f / swb_size;
+    const float iswb_sizem1 = 1.0f / (swb_size - 1);
+    const float ethresh = thresh;
+    float form = 0.0f, weight = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
+        float nzl = 0;
+        for (i = 0; i < swb_size; i++) {
+            float s = fabsf(scaled[w2*128+i]);
+            maxval = FFMAX(maxval, s);
+            e += s;
+            e2 += s *= s;
+            /* We really don't want a hard non-zero-line count, since
+             * even below-threshold lines do add up towards band spectral power.
+             * So, fall steeply towards zero, but smoothly
+             */
+            if (s >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s / ethresh, nzslope);
+            }
+        }
+        if (e2 > thresh) {
+            float frm;
+            e *= iswb_size;
+
+            /** compute variance */
+            for (i = 0; i < swb_size; i++) {
+                float d = fabsf(scaled[w2*128+i]) - e;
+                var += d*d;
+            }
+            var = sqrtf(var * iswb_sizem1);
+
+            e2 *= iswb_size;
+            frm = e / FFMIN(e+4*var,maxval);
+            form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
+            weight += e2;
+        }
+    }
+    if (weight > 0) {
+        return form / weight;
+    } else {
+        return 1.0f;
+    }
+}
+
 /** Return the minimum scalefactor where the quantized coef does not clip. */
 static inline uint8_t coef2minsf(float coef)
 {
@@ -125,6 +173,14 @@ static inline int quant_array_idx(const float val, const float *arr, const int n
     return index;
 }
 
+/**
+ * approximates exp10f(-3.0f*(0.5f + 0.5f * cosf(FFMIN(b,15.5f) / 15.5f)))
+ */
+static av_always_inline float bval2bmax(float b)
+{
+    return 0.001f + 0.0035f * (b*b*b) / (15.5f*15.5f*15.5f);
+}
+
 /*
  * linear congruential pseudorandom number generator, copied from the decoder
  */
diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
index af235c758c..34a3ea4296 100644
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@@ -158,6 +158,7 @@ typedef struct AacPsyContext{
     } pe;
     AacPsyCoeffs psy_coef[2][64];
     AacPsyChannel *ch;
+    float global_quality; ///< normalized global quality taken from avctx
 }AacPsyContext;
 
 /**
@@ -300,7 +301,8 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
     float bark;
     int i, j, g, start;
     float prev, minscale, minath, minsnr, pe_min;
-    const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
+    int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
+
     const int bandwidth    = ctx->avctx->cutoff ? ctx->avctx->cutoff : AAC_CUTOFF(ctx->avctx);
     const float num_bark   = calc_bark((float)bandwidth);
 
@@ -308,9 +310,15 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
     if (!ctx->model_priv_data)
         return AVERROR(ENOMEM);
     pctx = (AacPsyContext*) ctx->model_priv_data;
+    pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
+
+    if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+        /* Use the target average bitrate to compute spread parameters */
+        chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
+    }
 
     pctx->chan_bitrate = chan_bitrate;
-    pctx->frame_bits   = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
+    pctx->frame_bits   = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
     pctx->pe.min       =  8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
     pctx->pe.max       = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
     ctx->bitres.size   = 6144 - pctx->frame_bits;
@@ -398,7 +406,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
                                                  int channel, int prev_type)
 {
     int i, j;
-    int br               = ctx->avctx->bit_rate / ctx->avctx->channels;
+    int br               = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
     int attack_ratio     = br <= 16000 ? 18 : 10;
     AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
     AacPsyChannel *pch  = &pctx->ch[channel];
@@ -508,7 +516,12 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
     ctx->pe.max = FFMAX(pe, ctx->pe.max);
     ctx->pe.min = FFMIN(pe, ctx->pe.min);
 
-    return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
+    /* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
+     *   reservoir starvation from producing zero-bit frames
+     */
+    return FFMIN(
+        ctx->frame_bits * bit_factor,
+        FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
 }
 
 static float calc_pe_3gpp(AacPsyBand *band)
@@ -678,8 +691,26 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
     /* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
     ctx->ch[channel].entropy = pe;
+    if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+        /* (2.5 * 120) achieves almost transparent rate, and we want to give
+         * ample room downwards, so we make that equivalent to QSCALE=2.4
+         */
+        desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
+        desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+        desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+
+        /* PE slope smoothing */
+        if (ctx->bitres.bits > 0) {
+            desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+            desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+        }
+
+        pctx->pe.max = FFMAX(pe, pctx->pe.max);
+        pctx->pe.min = FFMIN(pe, pctx->pe.min);
+    } else {
     desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
     desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
+
     /* NOTE: PE correction is kept simple. During initial testing it had very
      *       little effect on the final bitrate. Probably a good idea to come
      *       back and do more testing later.
@@ -687,6 +718,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
     if (ctx->bitres.bits > 0)
         desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
                                0.85f, 1.15f);
+    }
     pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
     ctx->bitres.alloc = desired_bits;
 
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index c065018f56..4988f1d3df 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -233,6 +233,11 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)
 }
 #endif
 
+static inline av_const float ff_sqrf(float a)
+{
+    return a*a;
+}
+
 static inline int8_t ff_u8_to_s8(uint8_t a)
 {
     union {
diff --git a/libavcodec/mips/aaccoder_mips.c b/libavcodec/mips/aaccoder_mips.c
index 18d3f88743..e85bf8c5ca 100644
--- a/libavcodec/mips/aaccoder_mips.c
+++ b/libavcodec/mips/aaccoder_mips.c
@@ -178,6 +178,7 @@ static int find_min_book(float maxval, int sf) {
     float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
     float Q34 = sqrtf(Q * sqrtf(Q));
     int qmaxval, cb;
+    qmaxval = maxval * Q34 + 0.4054f;
     if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
         cb = 11;
     else
@@ -192,12 +193,13 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
@@ -262,26 +264,38 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
 
         put_bits(pb, p_bits[curidx], p_codes[curidx]);
 
-        if (out) {
-           vec = &p_vec[curidx*4];
-           out[i+0] = vec[0] * IQ;
-           out[i+1] = vec[1] * IQ;
-           out[i+2] = vec[2] * IQ;
-           out[i+3] = vec[3] * IQ;
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = vec[0] * IQ;
+            e2 = vec[1] * IQ;
+            e3 = vec[2] * IQ;
+            e4 = vec[3] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
@@ -365,26 +379,38 @@ static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
         v_bits  = p_bits[curidx] + count;
         put_bits(pb, v_bits, v_codes);
 
-        if (out) {
-           vec = &p_vec[curidx*4];
-           out[i+0] = copysignf(vec[0] * IQ, in[i+0]);
-           out[i+1] = copysignf(vec[1] * IQ, in[i+1]);
-           out[i+2] = copysignf(vec[2] * IQ, in[i+2]);
-           out[i+3] = copysignf(vec[3] * IQ, in[i+3]);
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = copysignf(vec[0] * IQ, in[i+0]);
+            e2 = copysignf(vec[1] * IQ, in[i+1]);
+            e3 = copysignf(vec[2] * IQ, in[i+2]);
+            e4 = copysignf(vec[3] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
@@ -455,27 +481,39 @@ static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
         v_bits  = p_bits[curidx] + p_bits[curidx2];
         put_bits(pb, v_bits, v_codes);
 
-        if (out) {
-           vec1 = &p_vec[curidx*2 ];
-           vec2 = &p_vec[curidx2*2];
-           out[i+0] = vec1[0] * IQ;
-           out[i+1] = vec1[1] * IQ;
-           out[i+2] = vec2[0] * IQ;
-           out[i+3] = vec2[1] * IQ;
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx*2 ];
+            vec2 = &p_vec[curidx2*2];
+            e1 = vec1[0] * IQ;
+            e2 = vec1[1] * IQ;
+            e3 = vec2[0] * IQ;
+            e4 = vec2[1] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                                       PutBitContext *pb, const float *in, float *out,
                                                       const float *scaled, int size, int scale_idx,
                                                       int cb, const float lambda, const float uplim,
-                                                      int *bits, const float ROUNDING)
+                                                      int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
@@ -561,27 +599,39 @@ static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
         v_bits  = p_bits[curidx2] + count2;
         put_bits(pb, v_bits, v_codes);
 
-        if (out) {
-           vec1 = &p_vec[curidx1*2];
-           vec2 = &p_vec[curidx2*2];
-           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
-           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
-           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
-           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                        PutBitContext *pb, const float *in, float *out,
                                                        const float *scaled, int size, int scale_idx,
                                                        int cb, const float lambda, const float uplim,
-                                                       int *bits, const float ROUNDING)
+                                                       int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
@@ -666,27 +716,39 @@ static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
         v_bits  = p_bits[curidx2] + count2;
         put_bits(pb, v_bits, v_codes);
 
-        if (out) {
-           vec1 = &p_vec[curidx1*2];
-           vec2 = &p_vec[curidx2*2];
-           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
-           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
-           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
-           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                                                    PutBitContext *pb, const float *in, float *out,
                                                    const float *scaled, int size, int scale_idx,
                                                    int cb, const float lambda, const float uplim,
-                                                   int *bits, const float ROUNDING)
+                                                   int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
@@ -772,13 +834,22 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
             v_bits  = p_bits[curidx2] + count2;
             put_bits(pb, v_bits, v_codes);
 
-            if (out) {
-               vec1 = &p_vectors[curidx*2 ];
-               vec2 = &p_vectors[curidx2*2];
-               out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
-               out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
-               out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
-               out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out || energy) {
+                float e1,e2,e3,e4;
+                vec1 = &p_vectors[curidx*2 ];
+                vec2 = &p_vectors[curidx2*2];
+                e1 = copysignf(vec1[0] * IQ, in[i+0]);
+                e2 = copysignf(vec1[1] * IQ, in[i+1]);
+                e3 = copysignf(vec2[0] * IQ, in[i+2]);
+                e4 = copysignf(vec2[1] * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
             }
         }
     } else {
@@ -892,23 +963,34 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                 put_bits(pb, len * 2 - 3, v_codes);
             }
 
-            if (out) {
-               vec1 = &p_vectors[curidx*2];
-               vec2 = &p_vectors[curidx2*2];
-               out[i+0] = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
-               out[i+1] = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
-               out[i+2] = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
-               out[i+3] = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+            if (out || energy) {
+                float e1, e2, e3, e4;
+                vec1 = &p_vectors[curidx*2];
+                vec2 = &p_vectors[curidx2*2];
+                e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+                e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+                e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+                e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
             }
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
                                                          PutBitContext *pb, const float *in, float *out,
                                                          const float *scaled, int size, int scale_idx,
                                                          int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) {
+                                                         int *bits, float *energy, const float ROUNDING) {
     av_assert0(0);
 }
 
@@ -916,7 +998,7 @@ static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
                                                          PutBitContext *pb, const float *in, float *out,
                                                          const float *scaled, int size, int scale_idx,
                                                          int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) {
+                                                         int *bits, float *energy, const float ROUNDING) {
     int i;
     if (bits)
         *bits = 0;
@@ -928,13 +1010,15 @@ static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
            out[i+3] = 0.0f;
         }
     }
+    if (energy)
+        *energy = 0.0f;
 }
 
 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
                                                          PutBitContext *pb, const float *in, float *out,
                                                          const float *scaled, int size, int scale_idx,
                                                          int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) = {
+                                                         int *bits, float *energy, const float ROUNDING) = {
     quantize_and_encode_band_cost_ZERO_mips,
     quantize_and_encode_band_cost_SQUAD_mips,
     quantize_and_encode_band_cost_SQUAD_mips,
@@ -955,17 +1039,17 @@ static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s
 
 #define quantize_and_encode_band_cost(                                       \
                                 s, pb, in, out, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, ROUNDING)               \
+                                lambda, uplim, bits, energy, ROUNDING)       \
     quantize_and_encode_band_cost_arr[cb](                                   \
                                 s, pb, in, out, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, ROUNDING)
+                                lambda, uplim, bits, energy, ROUNDING)
 
 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
                                           const float *in, float *out, int size, int scale_idx,
                                           int cb, const float lambda, int rtz)
 {
     quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
+                                  INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
 }
 
 /**
@@ -1445,7 +1529,7 @@ static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
 static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits, int rtz)
+                                     int *bits, float *energy, int rtz)
 {
     return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
 }
@@ -1458,7 +1542,7 @@ static float get_band_cost_ZERO_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy)
 {
     int i;
     float cost = 0;
@@ -1471,6 +1555,8 @@ static float get_band_cost_ZERO_mips(struct AACEncContext *s,
     }
     if (bits)
         *bits = 0;
+    if (energy)
+        *energy = 0.0f;
     return cost * lambda;
 }
 
@@ -1478,7 +1564,7 @@ static float get_band_cost_NONE_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy)
 {
     av_assert0(0);
     return 0;
@@ -1488,12 +1574,13 @@ static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1560,6 +1647,9 @@ static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
         curbits += p_bits[curidx];
         vec     = &p_codes[curidx*4];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
         __asm__ volatile (
             ".set push                                  \n\t"
             ".set noreorder                             \n\t"
@@ -1594,6 +1684,8 @@ static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1601,12 +1693,13 @@ static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int curbits = 0;
     int qc1, qc2, qc3, qc4;
 
@@ -1659,6 +1752,9 @@ static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
         curbits += uquad_sign_bits[curidx];
         vec     = &p_codes[curidx*4];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
         __asm__ volatile (
             ".set push                                  \n\t"
             ".set noreorder                             \n\t"
@@ -1696,6 +1792,8 @@ static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1703,12 +1801,13 @@ static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1780,6 +1879,9 @@ static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
         vec     = &p_codes[curidx*2];
         vec2    = &p_codes[curidx2*2];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
         __asm__ volatile (
             ".set push                                  \n\t"
             ".set noreorder                             \n\t"
@@ -1814,6 +1916,8 @@ static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1821,12 +1925,13 @@ static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
-                                       int *bits)
+                                       int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1910,6 +2015,9 @@ static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
         curbits += upair7_sign_bits[curidx2];
         vec2    = &p_codes[curidx2*2];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
         __asm__ volatile (
             ".set push                                          \n\t"
             ".set noreorder                                     \n\t"
@@ -1947,6 +2055,8 @@ static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1954,12 +2064,13 @@ static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
-                                        int *bits)
+                                        int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -2043,6 +2154,9 @@ static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
         vec     = &p_codes[curidx*2];
         vec2    = &p_codes[curidx2*2];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
         __asm__ volatile (
             ".set push                                          \n\t"
             ".set noreorder                                     \n\t"
@@ -2080,6 +2194,8 @@ static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -2087,13 +2203,14 @@ static float get_band_cost_ESC_mips(struct AACEncContext *s,
                                     PutBitContext *pb, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
-                                    int *bits)
+                                    int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     const float CLIPPED_ESCAPE = 165140.0f * IQ;
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -2103,7 +2220,7 @@ static float get_band_cost_ESC_mips(struct AACEncContext *s,
     for (i = 0; i < size; i += 4) {
         const float *vec, *vec2;
         int curidx, curidx2;
-        float t1, t2, t3, t4;
+        float t1, t2, t3, t4, V;
         float di1, di2, di3, di4;
         int cond0, cond1, cond2, cond3;
         int c1, c2, c3, c4;
@@ -2175,38 +2292,54 @@ static float get_band_cost_ESC_mips(struct AACEncContext *s,
         if (cond0) {
             if (t1 >= CLIPPED_ESCAPE) {
                 di1 = t1 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di1 = t1 - c1 * cbrtf(c1) * IQ;
+                di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di1 = t1 - vec[0] * IQ;
+        } else {
+            di1 = t1 - (V = vec[0] * IQ);
+            qenergy += V*V;
+        }
 
         if (cond1) {
             if (t2 >= CLIPPED_ESCAPE) {
                 di2 = t2 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di2 = t2 - c2 * cbrtf(c2) * IQ;
+                di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di2 = t2 - vec[1] * IQ;
+        } else {
+            di2 = t2 - (V = vec[1] * IQ);
+            qenergy += V*V;
+        }
 
         if (cond2) {
             if (t3 >= CLIPPED_ESCAPE) {
                 di3 = t3 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di3 = t3 - c3 * cbrtf(c3) * IQ;
+                di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di3 = t3 - vec2[0] * IQ;
+        } else {
+            di3 = t3 - (V = vec2[0] * IQ);
+            qenergy += V*V;
+        }
 
         if (cond3) {
             if (t4 >= CLIPPED_ESCAPE) {
                 di4 = t4 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di4 = t4 - c4 * cbrtf(c4) * IQ;
+                di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di4 = t4 - vec2[1]*IQ;
+        } else {
+            di4 = t4 - (V = vec2[1]*IQ);
+            qenergy += V*V;
+        }
 
         cost += di1 * di1 + di2 * di2
                 + di3 * di3 + di4 * di4;
@@ -2221,7 +2354,7 @@ static float (*const get_band_cost_arr[])(struct AACEncContext *s,
                                           PutBitContext *pb, const float *in,
                                           const float *scaled, int size, int scale_idx,
                                           int cb, const float lambda, const float uplim,
-                                          int *bits) = {
+                                          int *bits, float *energy) = {
     get_band_cost_ZERO_mips,
     get_band_cost_SQUAD_mips,
     get_band_cost_SQUAD_mips,
@@ -2242,17 +2375,87 @@ static float (*const get_band_cost_arr[])(struct AACEncContext *s,
 
 #define get_band_cost(                                  \
                                 s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)                    \
+                                lambda, uplim, bits, energy)            \
     get_band_cost_arr[cb](                              \
                                 s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
+                                lambda, uplim, bits, energy)
 
 static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits, int rtz)
+                                int *bits, float *energy, int rtz)
 {
-    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
+}
+
+static float find_form_factor(int group_len, int swb_size, float thresh, const float *scaled, float nzslope) {
+    const float iswb_size = 1.0f / swb_size;
+    const float iswb_sizem1 = 1.0f / (swb_size - 1);
+    const float ethresh = thresh, iethresh = 1.0f / ethresh;
+    float form = 0.0f, weight = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
+        float nzl = 0;
+        for (i = 0; i < swb_size; i+=4) {
+            float s1 = fabsf(scaled[w2*128+i  ]);
+            float s2 = fabsf(scaled[w2*128+i+1]);
+            float s3 = fabsf(scaled[w2*128+i+2]);
+            float s4 = fabsf(scaled[w2*128+i+3]);
+            maxval = FFMAX(maxval, FFMAX(FFMAX(s1, s2), FFMAX(s3, s4)));
+            e += (s1+s2)+(s3+s4);
+            s1 *= s1;
+            s2 *= s2;
+            s3 *= s3;
+            s4 *= s4;
+            e2 += (s1+s2)+(s3+s4);
+            /* We really don't want a hard non-zero-line count, since
+             * even below-threshold lines do add up towards band spectral power.
+             * So, fall steeply towards zero, but smoothly
+             */
+            if (s1 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s1 * iethresh, nzslope);
+            }
+            if (s2 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s2 * iethresh, nzslope);
+            }
+            if (s3 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s3 * iethresh, nzslope);
+            }
+            if (s4 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s4 * iethresh, nzslope);
+            }
+        }
+        if (e2 > thresh) {
+            float frm;
+            e *= iswb_size;
+
+            /** compute variance */
+            for (i = 0; i < swb_size; i++) {
+                float d = fabsf(scaled[w2*128+i]) - e;
+                var += d*d;
+            }
+            var = sqrtf(var * iswb_sizem1);
+
+            e2 *= iswb_size;
+            frm = e / FFMIN(e+4*var,maxval);
+            form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
+            weight += e2;
+        }
+    }
+    if (weight > 0) {
+        return form / weight;
+    } else {
+        return 1.0f;
+    }
 }
 
 #include "libavcodec/aaccoder_twoloop.h"
@@ -2305,25 +2508,25 @@ static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
                                                 sce0->ics.swb_sizes[g],
                                                 sce0->sf_idx[(w+w2)*16+g],
                                                 sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL, 0);
+                                                lambda / band0->threshold, INFINITY, NULL, NULL, 0);
                     dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                 R34,
                                                 sce1->ics.swb_sizes[g],
                                                 sce1->sf_idx[(w+w2)*16+g],
                                                 sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL, 0);
+                                                lambda / band1->threshold, INFINITY, NULL, NULL, 0);
                     dist2 += quantize_band_cost(s, M,
                                                 M34,
                                                 sce0->ics.swb_sizes[g],
                                                 sce0->sf_idx[(w+w2)*16+g],
                                                 sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL, 0);
+                                                lambda / maxthr, INFINITY, NULL, NULL, 0);
                     dist2 += quantize_band_cost(s, S,
                                                 S34,
                                                 sce1->ics.swb_sizes[g],
                                                 sce1->sf_idx[(w+w2)*16+g],
                                                 sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL, 0);
+                                                lambda / minthr, INFINITY, NULL, NULL, 0);
                 }
                 cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c
index 824eefb79e..f7bca6890c 100644
--- a/libavcodec/psymodel.c
+++ b/libavcodec/psymodel.c
@@ -109,25 +109,21 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
         return NULL;
     ctx->avctx = avctx;
 
+    /* AAC has its own LP method */
+    if (avctx->codec_id != AV_CODEC_ID_AAC) {
     if (avctx->cutoff > 0)
         cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
 
-    if (!cutoff_coeff && avctx->codec_id == AV_CODEC_ID_AAC)
-        cutoff_coeff = 2.0 * AAC_CUTOFF(avctx) / avctx->sample_rate;
-
     if (cutoff_coeff && cutoff_coeff < 0.98)
     ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
                                              FF_FILTER_MODE_LOWPASS, FILT_ORDER,
                                              cutoff_coeff, 0.0, 0.0);
     if (ctx->fcoeffs) {
-        ctx->fstate = av_mallocz_array(sizeof(ctx->fstate[0]), avctx->channels);
-        if (!ctx->fstate) {
-            av_free(ctx);
-            return NULL;
-        }
+        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
         for (i = 0; i < avctx->channels; i++)
             ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
     }
+    }
 
     ff_iir_filter_init(&ctx->fiir);
 
diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h
index a04cc4d226..565117db73 100644
--- a/libavcodec/psymodel.h
+++ b/libavcodec/psymodel.h
@@ -29,7 +29,20 @@
 /** maximum number of channels */
 #define PSY_MAX_CHANS 20
 
-#define AAC_CUTOFF(s) ((s)->bit_rate ? FFMIN3(4000 + (s)->bit_rate/8, 12000 + (s)->bit_rate/32, (s)->sample_rate / 2) : ((s)->sample_rate / 2))
+/* cutoff for VBR is purposedly increased, since LP filtering actually
+ * hinders VBR performance rather than the opposite
+ */
+#define AAC_CUTOFF_FROM_BITRATE(bit_rate,channels,sample_rate) (bit_rate ? FFMIN3(FFMIN3( \
+    FFMAX(bit_rate/channels/5, bit_rate/channels*15/32 - 5500), \
+    3000 + bit_rate/channels/4, \
+    12000 + bit_rate/channels/16), \
+    22000, \
+    sample_rate / 2): (sample_rate / 2))
+#define AAC_CUTOFF(s) ( \
+    (s->flags & CODEC_FLAG_QSCALE) \
+    ? s->sample_rate / 2 \
+    : AAC_CUTOFF_FROM_BITRATE(s->bit_rate, s->channels, s->sample_rate) \
+)
 
 /**
  * single band psychoacoustic information
diff --git a/tests/fate/aac.mak b/tests/fate/aac.mak
index d6a355e45b..30f0d9b1c3 100644
--- a/tests/fate/aac.mak
+++ b/tests/fate/aac.mak
@@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le $(REF) -strict -2 -c:a aa
 fate-aac-aref-encode: CMP = stddev
 fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav
 fate-aac-aref-encode: CMP_SHIFT = -4096
-fate-aac-aref-encode: CMP_TARGET = 584
+fate-aac-aref-encode: CMP_TARGET = 1127
 fate-aac-aref-encode: SIZE_TOLERANCE = 2464
 fate-aac-aref-encode: FUZZ = 6
 
@@ -155,51 +155,52 @@ fate-aac-ln-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-ref
 fate-aac-ln-encode: CMP = stddev
 fate-aac-ln-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-ln-encode: CMP_SHIFT = -4096
-fate-aac-ln-encode: CMP_TARGET = 68
+fate-aac-ln-encode: CMP_TARGET = 80
 fate-aac-ln-encode: SIZE_TOLERANCE = 3560
+fate-aac-ln-encode: FUZZ = 30
 
 FATE_AAC_ENCODE += fate-aac-ln-encode-128k
-fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
 fate-aac-ln-encode-128k: CMP = stddev
 fate-aac-ln-encode-128k: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-ln-encode-128k: CMP_SHIFT = -4096
-fate-aac-ln-encode-128k: CMP_TARGET = 638
+fate-aac-ln-encode-128k: CMP_TARGET = 745
 fate-aac-ln-encode-128k: SIZE_TOLERANCE = 3560
 fate-aac-ln-encode-128k: FUZZ = 5
 
 FATE_AAC_ENCODE += fate-aac-pns-encode
-fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k
+fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k -cutoff 22050
 fate-aac-pns-encode: CMP = stddev
 fate-aac-pns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pns-encode: CMP_SHIFT = -4096
-fate-aac-pns-encode: CMP_TARGET = 623.77
+fate-aac-pns-encode: CMP_TARGET = 695
 fate-aac-pns-encode: SIZE_TOLERANCE = 3560
 fate-aac-pns-encode: FUZZ = 25
 
 FATE_AAC_ENCODE += fate-aac-tns-encode
-fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
 fate-aac-tns-encode: CMP = stddev
 fate-aac-tns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-tns-encode: CMP_SHIFT = -4096
-fate-aac-tns-encode: CMP_TARGET = 644.50
+fate-aac-tns-encode: CMP_TARGET = 768
 fate-aac-tns-encode: FUZZ = 2.8
 fate-aac-tns-encode: SIZE_TOLERANCE = 3560
 
 FATE_AAC_ENCODE += fate-aac-is-encode
-fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 0 -aac_is 1 -b:a 128k
+fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 0 -aac_is 1 -b:a 128k -cutoff 22050
 fate-aac-is-encode: CMP = stddev
 fate-aac-is-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-is-encode: CMP_SHIFT = -4096
-fate-aac-is-encode: CMP_TARGET = 614.04
+fate-aac-is-encode: CMP_TARGET = 582
 fate-aac-is-encode: SIZE_TOLERANCE = 3560
 fate-aac-is-encode: FUZZ = 1
 
 FATE_AAC_ENCODE += fate-aac-pred-encode
-fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
 fate-aac-pred-encode: CMP = stddev
 fate-aac-pred-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pred-encode: CMP_SHIFT = -4096
-fate-aac-pred-encode: CMP_TARGET = 657
+fate-aac-pred-encode: CMP_TARGET = 790
 fate-aac-pred-encode: FUZZ = 5
 fate-aac-pred-encode: SIZE_TOLERANCE = 3560