Transcriber.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. /*
  2. * Transcriber.cpp
  3. * Copyright © 2011-2012 HAL, 2012 kbinani
  4. *
  5. * This file is part of vConnect-STAND.
  6. *
  7. * vConnect-STAND is free software; you can redistribute it and/or
  8. * modify it under the terms of the GPL License.
  9. *
  10. * vConnect-STAND is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. */
  14. #include <time.h>
  15. #include "utau/UtauDB.h"
  16. #include "world/world.h"
  17. #include "Transcriber.h"
  18. #include "vConnectPhoneme.h"
  19. #include "vConnectUtility.h"
  20. #include "Configuration.h"
  21. // 転写システム
  22. void Transcriber::run()
  23. {
  24. string src_path = this->option.getInputPath();
  25. string dst_path = this->option.getOutputPath();
  26. string codepage = this->option.getEncodingOtoIni();
  27. UtauDB src( src_path, codepage.c_str() );
  28. UtauDB dst( dst_path, codepage.c_str() );
  29. Map<string, int> analyzedItems;
  30. for( int i = 0; i < src.size(); i++ ){
  31. UtauParameter src_param, dst_param;
  32. src.getParams(src_param, i);
  33. cout << "Begin analysis : " << src_param.lyric << endl;
  34. Map<string, int>::iterator itr = analyzedItems.find(src_param.fileName);
  35. if( itr != analyzedItems.end() ){
  36. cout << " Already analyzed." << endl;
  37. continue;
  38. }
  39. clock_t cl = clock();
  40. //TODO: ノート番号をとりあえず 60 にしてある.
  41. if( dst.getParams( dst_param, src_param.lyric, 60 ) != 1 ){
  42. cout << " error; not found : " << src_param.lyric << endl;
  43. continue;
  44. }
  45. if( dst_param.isWave != src_param.isWave ){
  46. cout << " error; conflting format : " << src_param.fileName << endl;
  47. continue;
  48. }
  49. if( false == dst_param.isWave ){
  50. // 圧縮形式のマッチング.
  51. vConnectPhoneme src_phoneme, dst_phoneme;
  52. string tmp_path = src.getOtoIniPath() + src_param.fileName;
  53. if( !src_phoneme.readPhoneme( tmp_path.c_str() ) ){
  54. cout << " error; can't read file : " << tmp_path << endl;
  55. continue;
  56. }
  57. tmp_path = dst.getOtoIniPath() + dst_param.fileName;
  58. if( !dst_phoneme.readPhoneme( tmp_path.c_str() ) ){
  59. cout << " error; can't read file : " << tmp_path << endl;
  60. continue;
  61. }
  62. _transcribe_compressed(&src_phoneme, &dst_phoneme);
  63. /* src.getDBPath(tmp_path);
  64. tmp_path += src_param.fileName;
  65. mb_conv(tmp_path, s);
  66. src_phoneme.writePhoneme(s.c_str());
  67. */
  68. tmp_path = dst.getOtoIniPath() + dst_param.fileName;
  69. dst_phoneme.writePhoneme( tmp_path.c_str() );
  70. }else{
  71. vConnectPhoneme src_phoneme, dst_phoneme;
  72. /* ここは生波形用 */
  73. }
  74. cout << "Done. Elapsed time = " << (double)(clock() - cl) / CLOCKS_PER_SEC << " [s]" << endl;
  75. analyzedItems.insert(make_pair(src_param.fileName, i));
  76. }
  77. }
  78. void Transcriber::_transcribe_compressed(vConnectPhoneme *src, vConnectPhoneme *dst)
  79. {
  80. int src_len, dst_len;
  81. double *src_env, *dst_env;
  82. double *src_to_dst, *dst_to_src, *dst_to_src_stretched;
  83. src_len = src->getTimeLength();
  84. dst_len = dst->getTimeLength();
  85. src_env = new double[src_len];
  86. dst_env = new double[dst_len];
  87. src_to_dst = new double[src_len];
  88. dst_to_src = new double[dst_len];
  89. dst_to_src_stretched = new double[src_len];
  90. cout << " extract source compressed wave." << endl;
  91. _calculate_compressed_env(src_env, src, src_len);
  92. cout << " done." << endl;
  93. cout << " extract target compressed wave." << endl;
  94. _calculate_compressed_env(dst_env, dst, dst_len);
  95. cout << " done." << endl;
  96. cout << " calculate matching between two phonemes." << endl;
  97. cout << " ; src length = " << src_len << ", dst length = " << dst_len << endl;
  98. cout << " stretch dst->src." << endl;
  99. for( int i = 0; i < src_len - 1; i++ ){
  100. double tmp = (double)i / (double)src_len * (double)dst_len;
  101. if( tmp >= dst_len - 1 ){
  102. dst_to_src_stretched[i] = dst_env[dst_len-1];
  103. }else{
  104. dst_to_src_stretched[i] = vConnectUtility::interpolateArray(tmp, dst_env);
  105. }
  106. }
  107. dst_to_src_stretched[src_len-1] = dst_env[dst_len-1];
  108. cout << " calculate streching function." << endl;
  109. vConnectUtility::calculateMatching(dst_to_src_stretched, src_to_dst, src_env, dst_to_src_stretched, src_len);
  110. cout << " stretch src->dst." << endl;
  111. double framePeriod = Configuration::getMilliSecondsPerFrame();
  112. for( int i = 0; i < dst_len - 1; i++ ){
  113. double tmp = (double)i / (double)dst_len * (double)src_len;
  114. if( tmp >= src_len - 1 ){
  115. dst_to_src[i] = dst_to_src_stretched[src_len-1];
  116. }else{
  117. dst_to_src[i] = vConnectUtility::interpolateArray(tmp, dst_to_src_stretched) * framePeriod / 1000.0 / (double)src_len * (double)dst_len;
  118. }
  119. }
  120. dst_to_src[dst_len-1] = dst_to_src_stretched[src_len-1] / (double)src_len * (double)dst_len;
  121. for( int i = 0; i < src_len; i++ ){
  122. dst_to_src_stretched[i] = src_to_dst[i] * framePeriod / 1000.0;
  123. }
  124. memcpy( src_to_dst, dst_to_src_stretched, sizeof( double ) * src_len );
  125. cout << " done." << endl;
  126. dst->setTimeAxis( dst_to_src, dst_len );
  127. dst->setTimeAxis( src_to_dst, src_len );
  128. delete[] dst_to_src_stretched;
  129. delete[] src_to_dst;
  130. delete[] dst_to_src;
  131. delete[] src_env;
  132. delete[] dst_env;
  133. }
  134. void Transcriber::_calculate_compressed_env( double *dst, vConnectPhoneme *src, int length )
  135. {
  136. int fftl = 2048;
  137. double *out = new double[fftl];
  138. double *pow_spec = new double[fftl];
  139. double *res_wave = new double[fftl];
  140. fftw_complex *res_spec = new fftw_complex[fftl];
  141. fftw_complex *spectrum = new fftw_complex[fftl];
  142. fftw_complex *cepstrum = new fftw_complex[fftl];
  143. fftw_plan res_forward = fftw_plan_dft_r2c_1d( fftl, res_wave, res_spec, FFTW_ESTIMATE );
  144. fftw_plan forward = fftw_plan_dft_1d( fftl, spectrum, cepstrum, FFTW_FORWARD, FFTW_ESTIMATE );
  145. fftw_plan inverse = fftw_plan_dft_1d( fftl, cepstrum, spectrum, FFTW_BACKWARD, FFTW_ESTIMATE );
  146. fftw_plan inverse_c2r = fftw_plan_dft_c2r_1d( fftl, spectrum, out, FFTW_ESTIMATE );
  147. OggVorbis_File ovf;
  148. src->vorbisOpen( &ovf );
  149. float **pcm_channels;
  150. int sampleRate = Configuration::getDefaultSampleRate();
  151. for( int i = 0; i < length; i++ ){
  152. int mel_len;
  153. int c;
  154. float *mel_cep;
  155. double sum = 0.0;
  156. // ケプストラムからパワースペクトルを計算.
  157. mel_cep = src->getMelCepstrum( i, &mel_len );
  158. vConnectUtility::extractMelCepstrum( pow_spec, fftl, mel_cep, mel_len, spectrum, out, inverse_c2r, sampleRate );
  159. getMinimumPhaseSpectrum( pow_spec, spectrum, cepstrum, fftl, forward, inverse );
  160. // Ogg ストリームから残差波形をデコード.
  161. for( c = 0; c < fftl; ){
  162. int bitStream;
  163. long samples = ov_read_float( &ovf, &pcm_channels, fftl - c, &bitStream );
  164. if( samples <= 0 ){
  165. break;
  166. }
  167. for( int j = 0, k = c; j < samples && k < fftl; j++, k++ ){
  168. res_wave[k] = pcm_channels[0][j];
  169. }
  170. c += samples;
  171. }
  172. // 残差スペクトルの計算.
  173. fftw_execute( res_forward );
  174. for( int k = 0; k <= fftl / 2; k++ ){
  175. double real = spectrum[k][0] * res_spec[k][0] - spectrum[k][1] * res_spec[k][1];
  176. double imag = spectrum[k][1] * res_spec[k][0] + spectrum[k][0] * res_spec[k][1];
  177. spectrum[k][0] = real;
  178. spectrum[k][1] = imag;
  179. }
  180. fftw_execute( inverse_c2r );
  181. for( int j = 0; j < fftl; j++ ){
  182. sum += out[j] * out[j];
  183. }
  184. dst[i] = sum;
  185. }
  186. fftw_destroy_plan( inverse_c2r );
  187. fftw_destroy_plan( forward );
  188. fftw_destroy_plan( inverse );
  189. fftw_destroy_plan( res_forward );
  190. src->vorbisClose( &ovf );
  191. delete[] spectrum;
  192. delete[] cepstrum;
  193. delete[] res_spec;
  194. delete[] res_wave;
  195. delete[] pow_spec;
  196. delete[] out;
  197. }