patch-libavcodec_aacpsy_c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. $OpenBSD: patch-libavcodec_aacpsy_c,v 1.6 2016/04/07 06:01:19 ajacoutot Exp $
  2. AAC encoder: tweak rate-distortion logic
  3. AAC encoder: Extensive improvements
  4. AAC encoder: cosmetics from last commit
  5. AAC encoder: make pe.min a local minimum
  6. AAC encoder: improve SF range utilization
  7. AAC encoder: fix filling of wi.clipping array
  8. AAC encoder: fix valgrind errors
  9. --- libavcodec/aacpsy.c.orig Wed Jan 13 15:27:48 2016
  10. +++ libavcodec/aacpsy.c Wed Apr 6 18:31:01 2016
  11. @@ -25,6 +25,7 @@
  12. */
  13. #include "libavutil/attributes.h"
  14. +#include "libavutil/internal.h"
  15. #include "libavutil/libm.h"
  16. #include "avcodec.h"
  17. @@ -80,6 +81,8 @@
  18. #define PSY_3GPP_AH_THR_LONG 0.5f
  19. #define PSY_3GPP_AH_THR_SHORT 0.63f
  20. +#define PSY_PE_FORGET_SLOPE 511
  21. +
  22. enum {
  23. PSY_3GPP_AH_NONE,
  24. PSY_3GPP_AH_INACTIVE,
  25. @@ -87,6 +90,7 @@ enum {
  26. };
  27. #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
  28. +#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
  29. /* LAME psy model constants */
  30. #define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order
  31. @@ -157,6 +161,7 @@ typedef struct AacPsyContext{
  32. } pe;
  33. AacPsyCoeffs psy_coef[2][64];
  34. AacPsyChannel *ch;
  35. + float global_quality; ///< normalized global quality taken from avctx
  36. }AacPsyContext;
  37. /**
  38. @@ -299,17 +304,24 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
  39. float bark;
  40. int i, j, g, start;
  41. float prev, minscale, minath, minsnr, pe_min;
  42. - const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
  43. - const int bandwidth = ctx->avctx->cutoff ? ctx->avctx->cutoff : AAC_CUTOFF(ctx->avctx);
  44. + int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
  45. +
  46. + const int bandwidth = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
  47. const float num_bark = calc_bark((float)bandwidth);
  48. ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
  49. if (!ctx->model_priv_data)
  50. return AVERROR(ENOMEM);
  51. pctx = (AacPsyContext*) ctx->model_priv_data;
  52. + pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
  53. + if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
  54. + /* Use the target average bitrate to compute spread parameters */
  55. + chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
  56. + }
  57. +
  58. pctx->chan_bitrate = chan_bitrate;
  59. - pctx->frame_bits = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
  60. + pctx->frame_bits = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
  61. pctx->pe.min = 8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
  62. pctx->pe.max = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
  63. ctx->bitres.size = 6144 - pctx->frame_bits;
  64. @@ -338,10 +350,10 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
  65. for (g = 0; g < ctx->num_bands[j] - 1; g++) {
  66. AacPsyCoeffs *coeff = &coeffs[g];
  67. float bark_width = coeffs[g+1].barks - coeffs->barks;
  68. - coeff->spread_low[0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_LOW);
  69. - coeff->spread_hi [0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_HI);
  70. - coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
  71. - coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
  72. + coeff->spread_low[0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_LOW);
  73. + coeff->spread_hi [0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_HI);
  74. + coeff->spread_low[1] = ff_exp10(-bark_width * en_spread_low);
  75. + coeff->spread_hi [1] = ff_exp10(-bark_width * en_spread_hi);
  76. pe_min = bark_pe * bark_width;
  77. minsnr = exp2(pe_min / band_sizes[g]) - 1.5f;
  78. coeff->min_snr = av_clipf(1.0f / minsnr, PSY_SNR_25DB, PSY_SNR_1DB);
  79. @@ -397,7 +409,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsy
  80. int channel, int prev_type)
  81. {
  82. int i, j;
  83. - int br = ctx->avctx->bit_rate / ctx->avctx->channels;
  84. + int br = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
  85. int attack_ratio = br <= 16000 ? 18 : 10;
  86. AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
  87. AacPsyChannel *pch = &pctx->ch[channel];
  88. @@ -486,7 +498,7 @@ static int calc_bit_demand(AacPsyContext *ctx, float p
  89. const float bitspend_add = short_window ? PSY_3GPP_SPEND_ADD_S : PSY_3GPP_SPEND_ADD_L;
  90. const float clip_low = short_window ? PSY_3GPP_CLIP_LO_S : PSY_3GPP_CLIP_LO_L;
  91. const float clip_high = short_window ? PSY_3GPP_CLIP_HI_S : PSY_3GPP_CLIP_HI_L;
  92. - float clipped_pe, bit_save, bit_spend, bit_factor, fill_level;
  93. + float clipped_pe, bit_save, bit_spend, bit_factor, fill_level, forgetful_min_pe;
  94. ctx->fill_level += ctx->frame_bits - bits;
  95. ctx->fill_level = av_clip(ctx->fill_level, 0, size);
  96. @@ -503,11 +515,21 @@ static int calc_bit_demand(AacPsyContext *ctx, float p
  97. * Hopefully below is correct.
  98. */
  99. bit_factor = 1.0f - bit_save + ((bit_spend - bit_save) / (ctx->pe.max - ctx->pe.min)) * (clipped_pe - ctx->pe.min);
  100. - /* NOTE: The reference encoder attempts to center pe max/min around the current pe. */
  101. + /* NOTE: The reference encoder attempts to center pe max/min around the current pe.
  102. + * Here we do that by slowly forgetting pe.min when pe stays in a range that makes
  103. + * it unlikely (i.e. above the mean)
  104. + */
  105. ctx->pe.max = FFMAX(pe, ctx->pe.max);
  106. - ctx->pe.min = FFMIN(pe, ctx->pe.min);
  107. + forgetful_min_pe = ((ctx->pe.min * PSY_PE_FORGET_SLOPE)
  108. + + FFMAX(ctx->pe.min, pe * (pe / ctx->pe.max))) / (PSY_PE_FORGET_SLOPE + 1);
  109. + ctx->pe.min = FFMIN(pe, forgetful_min_pe);
  110. - return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
  111. + /* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
  112. + * reservoir starvation from producing zero-bit frames
  113. + */
  114. + return FFMIN(
  115. + ctx->frame_bits * bit_factor,
  116. + FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
  117. }
  118. static float calc_pe_3gpp(AacPsyBand *band)
  119. @@ -574,26 +596,30 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, f
  120. #ifndef calc_thr_3gpp
  121. static void calc_thr_3gpp(const FFPsyWindowInfo *wi, const int num_bands, AacPsyChannel *pch,
  122. - const uint8_t *band_sizes, const float *coefs)
  123. + const uint8_t *band_sizes, const float *coefs, const int cutoff)
  124. {
  125. int i, w, g;
  126. - int start = 0;
  127. + int start = 0, wstart = 0;
  128. for (w = 0; w < wi->num_windows*16; w += 16) {
  129. + wstart = 0;
  130. for (g = 0; g < num_bands; g++) {
  131. AacPsyBand *band = &pch->band[w+g];
  132. float form_factor = 0.0f;
  133. float Temp;
  134. band->energy = 0.0f;
  135. - for (i = 0; i < band_sizes[g]; i++) {
  136. - band->energy += coefs[start+i] * coefs[start+i];
  137. - form_factor += sqrtf(fabs(coefs[start+i]));
  138. + if (wstart < cutoff) {
  139. + for (i = 0; i < band_sizes[g]; i++) {
  140. + band->energy += coefs[start+i] * coefs[start+i];
  141. + form_factor += sqrtf(fabs(coefs[start+i]));
  142. + }
  143. }
  144. Temp = band->energy > 0 ? sqrtf((float)band_sizes[g] / band->energy) : 0;
  145. band->thr = band->energy * 0.001258925f;
  146. band->nz_lines = form_factor * sqrtf(Temp);
  147. start += band_sizes[g];
  148. + wstart += band_sizes[g];
  149. }
  150. }
  151. }
  152. @@ -634,9 +660,11 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx
  153. const uint8_t *band_sizes = ctx->bands[wi->num_windows == 8];
  154. AacPsyCoeffs *coeffs = pctx->psy_coef[wi->num_windows == 8];
  155. const float avoid_hole_thr = wi->num_windows == 8 ? PSY_3GPP_AH_THR_SHORT : PSY_3GPP_AH_THR_LONG;
  156. + const int bandwidth = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
  157. + const int cutoff = bandwidth * 2048 / wi->num_windows / ctx->avctx->sample_rate;
  158. //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
  159. - calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs);
  160. + calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs, cutoff);
  161. //modify thresholds and energies - spread, threshold in quiet, pre-echo control
  162. for (w = 0; w < wi->num_windows*16; w += 16) {
  163. @@ -658,7 +686,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx
  164. band->thr_quiet = band->thr = FFMAX(band->thr, coeffs[g].ath);
  165. //5.4.2.5 "Pre-echo control"
  166. - if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (wi->window_type[1] == LONG_START_SEQUENCE && !w)))
  167. + if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (!w && wi->window_type[1] == LONG_START_SEQUENCE)))
  168. band->thr = FFMAX(PSY_3GPP_RPEMIN*band->thr, FFMIN(band->thr,
  169. PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
  170. @@ -677,16 +705,36 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx
  171. /* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
  172. ctx->ch[channel].entropy = pe;
  173. - desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
  174. - desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
  175. - /* NOTE: PE correction is kept simple. During initial testing it had very
  176. - * little effect on the final bitrate. Probably a good idea to come
  177. - * back and do more testing later.
  178. - */
  179. - if (ctx->bitres.bits > 0)
  180. - desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
  181. - 0.85f, 1.15f);
  182. + if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
  183. + /* (2.5 * 120) achieves almost transparent rate, and we want to give
  184. + * ample room downwards, so we make that equivalent to QSCALE=2.4
  185. + */
  186. + desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
  187. + desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
  188. + desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
  189. +
  190. + /* PE slope smoothing */
  191. + if (ctx->bitres.bits > 0) {
  192. + desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
  193. + desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
  194. + }
  195. +
  196. + pctx->pe.max = FFMAX(pe, pctx->pe.max);
  197. + pctx->pe.min = FFMIN(pe, pctx->pe.min);
  198. + } else {
  199. + desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
  200. + desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
  201. +
  202. + /* NOTE: PE correction is kept simple. During initial testing it had very
  203. + * little effect on the final bitrate. Probably a good idea to come
  204. + * back and do more testing later.
  205. + */
  206. + if (ctx->bitres.bits > 0)
  207. + desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
  208. + 0.85f, 1.15f);
  209. + }
  210. pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
  211. + ctx->bitres.alloc = desired_bits;
  212. if (desired_pe < pe) {
  213. /* 5.6.1.3.4 "First Estimation of the reduction value" */
  214. @@ -788,6 +836,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx
  215. psy_band->threshold = band->thr;
  216. psy_band->energy = band->energy;
  217. psy_band->spread = band->active_lines * 2.0f / band_sizes[g];
  218. + psy_band->bits = PSY_3GPP_PE_TO_BITS(band->pe);
  219. }
  220. }
  221. @@ -927,21 +976,6 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *c
  222. lame_apply_block_type(pch, &wi, uselongblock);
  223. - /* Calculate input sample maximums and evaluate clipping risk */
  224. - if (audio) {
  225. - for (i = 0; i < AAC_NUM_BLOCKS_SHORT; i++) {
  226. - const float *wbuf = audio + i * AAC_BLOCK_SIZE_SHORT;
  227. - float max = 0;
  228. - int j;
  229. - for (j = 0; j < AAC_BLOCK_SIZE_SHORT; j++)
  230. - max = FFMAX(max, fabsf(wbuf[j]));
  231. - clippings[i] = max;
  232. - }
  233. - } else {
  234. - for (i = 0; i < 8; i++)
  235. - clippings[i] = 0;
  236. - }
  237. -
  238. wi.window_type[1] = prev_type;
  239. if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
  240. float clipping = 0.0f;
  241. @@ -970,9 +1004,10 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *c
  242. for (i = 0; i < 8; i += wi.grouping[i]) {
  243. int w;
  244. float clipping = 0.0f;
  245. - for (w = 0; w < wi.grouping[i] && !clipping; w++)
  246. + for (w = 0; w < wi.grouping[i]; w++)
  247. clipping = FFMAX(clipping, clippings[i+w]);
  248. - wi.clipping[i] = clipping;
  249. + for (w = 0; w < wi.grouping[i]; w++)
  250. + wi.clipping[i+w] = clipping;
  251. }
  252. }