codec.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. //-----------------------------------------------------------------------------
  2. // Copyright 2017 Masanori Morise
  3. // Author: mmorise [at] yamanashi.ac.jp (Masanori Morise)
  4. // Last update: 2017/05/09
  5. //
  6. // Coder/decoder functions for the spectral envelope and aperiodicity.
  7. //-----------------------------------------------------------------------------
  8. #include "world/codec.h"
  9. #include <math.h>
  10. #include "world/constantnumbers.h"
  11. #include "world/fft.h"
  12. #include "world/matlabfunctions.h"
  13. namespace {
  14. //-----------------------------------------------------------------------------
  15. // Aperiodicity is initialized by the value 1.0 - world::kMySafeGuardMinimum.
  16. // This value means the frame/frequency index is aperiodic.
  17. //-----------------------------------------------------------------------------
  18. static void InitializeAperiodicity(int f0_length, int fft_size,
  19. double **aperiodicity) {
  20. for (int i = 0; i < f0_length; ++i)
  21. for (int j = 0; j < fft_size / 2 + 1; ++j)
  22. aperiodicity[i][j] = 1.0 - world::kMySafeGuardMinimum;
  23. }
  24. //-----------------------------------------------------------------------------
  25. // This function identifies whether this frame is voiced or unvoiced.
  26. //-----------------------------------------------------------------------------
  27. static int CheckVUV(const double *coarse_aperiodicity,
  28. int number_of_aperiodicities, double *tmp_aperiodicity) {
  29. double tmp = 0.0;
  30. for (int i = 0; i < number_of_aperiodicities; ++i) {
  31. tmp += coarse_aperiodicity[i];
  32. tmp_aperiodicity[i + 1] = coarse_aperiodicity[i];
  33. }
  34. tmp /= number_of_aperiodicities;
  35. return tmp > -0.5 ? 1 : 0; // -0.5 is not optimized, but okay.
  36. }
  37. //-----------------------------------------------------------------------------
  38. // Aperiodicity is obtained from the coded aperiodicity.
  39. //-----------------------------------------------------------------------------
  40. static void GetAperiodicity(const double *coarse_frequency_axis,
  41. const double *coarse_aperiodicity, int number_of_aperiodicities,
  42. const double *frequency_axis, int fft_size, double *aperiodicity) {
  43. interp1(coarse_frequency_axis, coarse_aperiodicity,
  44. number_of_aperiodicities + 2, frequency_axis, fft_size / 2 + 1,
  45. aperiodicity);
  46. for (int i = 0; i <= fft_size / 2; ++i)
  47. aperiodicity[i] = pow(10.0, aperiodicity[i] / 20.0);
  48. }
  49. //-----------------------------------------------------------------------------
  50. // Frequency is converted into its mel representation.
  51. //-----------------------------------------------------------------------------
  52. static inline double FrequencyToMel(double frequency) {
  53. return world::kM0 * log(frequency / world::kF0 + 1.0);
  54. }
  55. //-----------------------------------------------------------------------------
  56. // Mel is converted into frequency.
  57. //-----------------------------------------------------------------------------
  58. static inline double MelToFrequency(double mel) {
  59. return world::kF0 * (exp(mel / world::kM0) - 1.0);
  60. }
  61. //-----------------------------------------------------------------------------
  62. // DCT for spectral envelope coding
  63. //-----------------------------------------------------------------------------
  64. static void DCTForCodec(const double *mel_spectrum, int max_dimension,
  65. const fft_complex *weight, const ForwardRealFFT *forward_real_fft,
  66. int number_of_dimensions, double *mel_cepstrum) {
  67. int bias = max_dimension / 2;
  68. for (int i = 0; i < max_dimension / 2; ++i) {
  69. forward_real_fft->waveform[i] = mel_spectrum[i * 2];
  70. forward_real_fft->waveform[i + bias] =
  71. mel_spectrum[max_dimension - (i * 2) - 1];
  72. }
  73. fft_execute(forward_real_fft->forward_fft);
  74. double normalization = sqrt(forward_real_fft->fft_size);
  75. for (int i = 0; i < number_of_dimensions; ++i)
  76. mel_cepstrum[i] = (forward_real_fft->spectrum[i][0] * weight[i][0] -
  77. forward_real_fft->spectrum[i][1] * weight[i][1]) / normalization;
  78. }
  79. //-----------------------------------------------------------------------------
  80. // IDCT for spectral envelope decoding
  81. //-----------------------------------------------------------------------------
  82. static void IDCTForCodec(const double *mel_cepstrum, int max_dimension,
  83. const fft_complex *weight, const InverseComplexFFT *inverse_complex_fft,
  84. int number_of_dimensions, double *mel_spectrum) {
  85. double normalization = sqrt(inverse_complex_fft->fft_size);
  86. for (int i = 0; i < number_of_dimensions; ++i) {
  87. inverse_complex_fft->input[i][0] =
  88. mel_cepstrum[i] * weight[i][0] * normalization;
  89. inverse_complex_fft->input[i][1] =
  90. -mel_cepstrum[i] * weight[i][1] * normalization;
  91. }
  92. for (int i = number_of_dimensions; i < max_dimension; ++i) {
  93. inverse_complex_fft->input[i][0] = 0.0;
  94. inverse_complex_fft->input[i][1] = 0.0;
  95. }
  96. fft_execute(inverse_complex_fft->inverse_fft);
  97. for (int i = 0; i < max_dimension / 2; ++i) {
  98. mel_spectrum[i * 2] = inverse_complex_fft->output[i][0];
  99. mel_spectrum[(i * 2) + 1] =
  100. inverse_complex_fft->output[max_dimension - i - 1][0];
  101. }
  102. }
  103. //-----------------------------------------------------------------------------
  104. // Spectral envelope in a frame is coded
  105. //-----------------------------------------------------------------------------
  106. static void CodeOneFrame(const double *log_spectral_envelope,
  107. const double *frequency_axis, int fft_size, const double *mel_axis,
  108. const fft_complex *weight, int max_dimension, int number_of_dimensions,
  109. const ForwardRealFFT *forward_real_fft, double *coded_spectral_envelope) {
  110. double *mel_spectrum = new double[max_dimension];
  111. interp1(frequency_axis, log_spectral_envelope, fft_size / 2 + 1,
  112. mel_axis, max_dimension, mel_spectrum);
  113. // DCT
  114. DCTForCodec(mel_spectrum, max_dimension, weight, forward_real_fft,
  115. number_of_dimensions, coded_spectral_envelope);
  116. delete[] mel_spectrum;
  117. }
  118. //-----------------------------------------------------------------------------
  119. // Coded spectral envelope in a frame is decoded
  120. //-----------------------------------------------------------------------------
  121. static void DecodeOneFrame(const double *coded_spectral_envelope,
  122. const double *frequency_axis, int fft_size, const double *mel_axis,
  123. const fft_complex *weight, int max_dimension, int number_of_dimensions,
  124. const InverseComplexFFT *inverse_complex_fft, double *spectral_envelope) {
  125. double *mel_spectrum = new double[max_dimension + 2];
  126. // IDCT
  127. IDCTForCodec(coded_spectral_envelope, max_dimension, weight,
  128. inverse_complex_fft, number_of_dimensions, &mel_spectrum[1]);
  129. mel_spectrum[0] = mel_spectrum[1];
  130. mel_spectrum[max_dimension + 1] = mel_spectrum[max_dimension];
  131. interp1(mel_axis, mel_spectrum, max_dimension + 2, frequency_axis,
  132. fft_size / 2 + 1, spectral_envelope);
  133. for (int i = 0; i < fft_size / 2 + 1; ++i)
  134. spectral_envelope[i] = exp(spectral_envelope[i] / max_dimension);
  135. delete[] mel_spectrum;
  136. }
  137. //-----------------------------------------------------------------------------
  138. // GetParameters() generates the required parameters.
  139. //-----------------------------------------------------------------------------
  140. static void GetParametersForCoding(double floor_frequency,
  141. double ceil_frequency, int fs, int fft_size, double *mel_axis,
  142. double *frequency_axis, fft_complex *weight) {
  143. int max_dimension = fft_size / 2;
  144. double floor_mel = FrequencyToMel(floor_frequency);
  145. double ceil_mel = FrequencyToMel(ceil_frequency);
  146. // Generate the mel axis and the weighting vector for DCT.
  147. for (int i = 0; i < max_dimension; ++i) {
  148. mel_axis[i] = (ceil_mel - floor_mel) * i / max_dimension + floor_mel;
  149. weight[i][0] = 2.0 * cos(i * world::kPi / fft_size) / sqrt(fft_size);
  150. weight[i][1] = 2.0 * sin(i * world::kPi / fft_size) / sqrt(fft_size);
  151. }
  152. weight[0][0] /= sqrt(2.0);
  153. // Generate the frequency axis on mel scale
  154. for (int i = 0; i < max_dimension; ++i)
  155. frequency_axis[i] = FrequencyToMel(static_cast<double>(i) * fs / fft_size);
  156. }
  157. //-----------------------------------------------------------------------------
  158. // GetParameters() generates the required parameters.
  159. //-----------------------------------------------------------------------------
  160. static void GetParametersForDecoding(double floor_frequency,
  161. double ceil_frequency, int fs, int fft_size, int number_of_dimensions,
  162. double *mel_axis, double *frequency_axis, fft_complex *weight) {
  163. int max_dimension = fft_size / 2;
  164. double floor_mel = FrequencyToMel(floor_frequency);
  165. double ceil_mel = FrequencyToMel(ceil_frequency);
  166. // Generate the weighting vector for IDCT.
  167. for (int i = 0; i < number_of_dimensions; ++i) {
  168. weight[i][0] = cos(i * world::kPi / fft_size) * sqrt(fft_size);
  169. weight[i][1] = sin(i * world::kPi / fft_size) * sqrt(fft_size);
  170. }
  171. weight[0][0] /= sqrt(2.0);
  172. // Generate the mel axis for IDCT.
  173. for (int i = 0; i < max_dimension; ++i)
  174. mel_axis[i + 1] =
  175. MelToFrequency((ceil_mel - floor_mel) * i / max_dimension + floor_mel);
  176. mel_axis[0] = 0;
  177. mel_axis[max_dimension + 1] = fs / 2.0;
  178. // Generate the frequency axis
  179. for (int i = 0; i < fft_size / 2 + 1; ++i)
  180. frequency_axis[i] = static_cast<double>(i) * fs / fft_size;
  181. }
  182. } // namespace
  183. int GetNumberOfAperiodicities(int fs) {
  184. return static_cast<int>(MyMinDouble(world::kUpperLimit, fs / 2.0 -
  185. world::kFrequencyInterval) / world::kFrequencyInterval);
  186. }
  187. void CodeAperiodicity(const double * const *aperiodicity, int f0_length,
  188. int fs, int fft_size, double **coded_aperiodicity) {
  189. int number_of_aperiodicities = GetNumberOfAperiodicities(fs);
  190. double *coarse_frequency_axis = new double[number_of_aperiodicities];
  191. for (int i = 0; i < number_of_aperiodicities; ++i)
  192. coarse_frequency_axis[i] = world::kFrequencyInterval * (i + 1.0);
  193. double *log_aperiodicity = new double[fft_size / 2 + 1];
  194. for (int i = 0; i < f0_length; ++i) {
  195. for (int j = 0; j < fft_size / 2 + 1; ++j)
  196. log_aperiodicity[j] = 20 * log10(aperiodicity[i][j]);
  197. interp1Q(0, static_cast<double>(fs) / fft_size, log_aperiodicity,
  198. fft_size / 2 + 1, coarse_frequency_axis, number_of_aperiodicities,
  199. coded_aperiodicity[i]);
  200. }
  201. delete[] coarse_frequency_axis;
  202. delete[] log_aperiodicity;
  203. }
  204. void DecodeAperiodicity(const double * const *coded_aperiodicity,
  205. int f0_length, int fs, int fft_size, double **aperiodicity) {
  206. InitializeAperiodicity(f0_length, fft_size, aperiodicity);
  207. int number_of_aperiodicities = GetNumberOfAperiodicities(fs);
  208. double *frequency_axis = new double[fft_size / 2 + 1];
  209. for (int i = 0; i <= fft_size / 2; ++i)
  210. frequency_axis[i] = static_cast<double>(fs) / fft_size * i;
  211. double *coarse_frequency_axis = new double[number_of_aperiodicities + 2];
  212. for (int i = 0; i <= number_of_aperiodicities; ++i)
  213. coarse_frequency_axis[i] = i * world::kFrequencyInterval;
  214. coarse_frequency_axis[number_of_aperiodicities + 1] = fs / 2.0;
  215. double *coarse_aperiodicity = new double[number_of_aperiodicities + 2];
  216. coarse_aperiodicity[0] = -60.0;
  217. coarse_aperiodicity[number_of_aperiodicities + 1] =
  218. -world::kMySafeGuardMinimum;
  219. for (int i = 0; i < f0_length; ++i) {
  220. if (CheckVUV(coded_aperiodicity[i], number_of_aperiodicities,
  221. coarse_aperiodicity) == 1) continue;
  222. GetAperiodicity(coarse_frequency_axis, coarse_aperiodicity,
  223. number_of_aperiodicities, frequency_axis, fft_size, aperiodicity[i]);
  224. }
  225. delete[] coarse_aperiodicity;
  226. delete[] coarse_frequency_axis;
  227. delete[] frequency_axis;
  228. }
  229. void CodeSpectralEnvelope(const double * const *spectrogram, int f0_length,
  230. int fs, int fft_size, int number_of_dimensions,
  231. double **coded_spectral_envelope) {
  232. double *mel_axis = new double[fft_size / 2];
  233. double *frequency_axis = new double[fft_size / 2 + 1];
  234. double *tmp_spectrum = new double[fft_size / 2 + 1];
  235. fft_complex *weight = new fft_complex[fft_size / 2];
  236. // Generation of the required parameters
  237. GetParametersForCoding(world::kFloorFrequency,
  238. MyMinDouble(fs / 2.0, world::kCeilFrequency), fs, fft_size,
  239. mel_axis, frequency_axis, weight);
  240. ForwardRealFFT forward_real_fft = { 0 };
  241. InitializeForwardRealFFT(fft_size / 2, &forward_real_fft);
  242. for (int i = 0; i < f0_length; ++i) {
  243. for (int j = 0; j < fft_size / 2 + 1; ++j)
  244. tmp_spectrum[j] = log(spectrogram[i][j]);
  245. CodeOneFrame(tmp_spectrum, frequency_axis, fft_size, mel_axis, weight,
  246. fft_size / 2, number_of_dimensions, &forward_real_fft,
  247. coded_spectral_envelope[i]);
  248. }
  249. DestroyForwardRealFFT(&forward_real_fft);
  250. delete[] weight;
  251. delete[] tmp_spectrum;
  252. delete[] frequency_axis;
  253. delete[] mel_axis;
  254. }
  255. void DecodeSpectralEnvelope(const double * const *coded_spectral_envelope,
  256. int f0_length, int fs, int fft_size, int number_of_dimensions,
  257. double **spectrogram) {
  258. double *mel_axis = new double[fft_size / 2 + 2];
  259. double *frequency_axis = new double[fft_size / 2 + 1];
  260. double *tmp_spectrum = new double[fft_size / 2 + 1];
  261. fft_complex *weight = new fft_complex[fft_size / 2];
  262. // Generation of the required parameters
  263. GetParametersForDecoding(world::kFloorFrequency,
  264. MyMinDouble(fs / 2.0, world::kCeilFrequency),
  265. fs, fft_size, number_of_dimensions, mel_axis, frequency_axis, weight);
  266. InverseComplexFFT inverse_complex_fft = { 0 };
  267. InitializeInverseComplexFFT(fft_size / 2, &inverse_complex_fft);
  268. for (int i = 0; i < f0_length; ++i) {
  269. DecodeOneFrame(coded_spectral_envelope[i], frequency_axis, fft_size,
  270. mel_axis, weight, fft_size / 2, number_of_dimensions,
  271. &inverse_complex_fft, spectrogram[i]);
  272. }
  273. DestroyInverseComplexFFT(&inverse_complex_fft);
  274. delete[] weight;
  275. delete[] tmp_spectrum;
  276. delete[] frequency_axis;
  277. delete[] mel_axis;
  278. }