vConnectPhoneme.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608
  1. #include "vConnectPhoneme.h"
  2. #include <stdio.h>
  3. #include "vConnectUtility.h"
  4. #include "world/world.h"
  5. #include "WaveBuffer/WaveBuffer.h"
  6. #include "worldParameters.h"
  7. #include "Configuration.h"
  8. vConnectPhoneme::vConnectPhoneme()
  9. {
  10. timeLength = 0;
  11. cepstrumLength = 0;
  12. vorbisSize = 0;
  13. framePeriod = 0.0f;
  14. melCepstrum = NULL;
  15. baseTimeAxis = t = f0 = NULL;
  16. vorbisData = NULL;
  17. wave = NULL;
  18. pulseLocations = NULL;
  19. mode = VCNT_UNKNOWN;
  20. waveLength = 0;
  21. waveOffset = 0;
  22. baseTimeLength = 0;
  23. }
  24. vConnectPhoneme::~vConnectPhoneme()
  25. {
  26. destroy();
  27. }
  28. void vConnectPhoneme::destroy()
  29. {
  30. if(melCepstrum) {
  31. for(int i = 0; i < cepstrumLength; i++) {
  32. delete[] melCepstrum[i];
  33. }
  34. delete[] melCepstrum;
  35. }
  36. delete[] f0;
  37. delete[] t;
  38. delete[] baseTimeAxis;
  39. delete[] vorbisData;
  40. delete[] (wave + waveOffset);
  41. delete[] pulseLocations;
  42. timeLength = 0;
  43. baseTimeLength = 0;
  44. cepstrumLength = 0;
  45. vorbisSize = 0;
  46. framePeriod = 0.0f;
  47. waveLength = 0;
  48. waveOffset = 0;
  49. melCepstrum = NULL;
  50. baseTimeAxis = t = f0 = NULL;
  51. mode = VCNT_UNKNOWN;
  52. vorbisData = NULL;
  53. wave = NULL;
  54. pulseLocations = NULL;
  55. }
  56. bool vConnectPhoneme::writePhoneme(const char* path)
  57. {
  58. if( mode != VCNT_COMPRESSED || !melCepstrum || !f0 || !vorbisData
  59. || timeLength <= 0 || cepstrumLength <= 0 || vorbisSize <= 0) {
  60. return false;
  61. }
  62. FILE *fp = fopen(path, "wb");
  63. if(!fp) {
  64. return false;
  65. }
  66. fwrite(&timeLength, sizeof(int), 1, fp);
  67. fwrite(&cepstrumLength, sizeof(float), 1, fp);
  68. fwrite(&framePeriod, sizeof(int), 1, fp);
  69. for(int i = 0; i < timeLength; i++) {
  70. fwrite(melCepstrum[i], sizeof(float), cepstrumLength, fp);
  71. }
  72. fwrite(f0, sizeof(float), timeLength, fp);
  73. fwrite(t , sizeof(float), timeLength, fp);
  74. fwrite(&vorbisSize, sizeof(int), 1, fp);
  75. fwrite(vorbisData, vorbisSize, 1, fp);
  76. if(baseTimeAxis)
  77. {
  78. fwrite(&baseTimeLength, sizeof(int), 1, fp);
  79. fwrite(baseTimeAxis, sizeof(float), baseTimeLength, fp);
  80. }
  81. fclose(fp);
  82. return true;
  83. }
  84. bool vConnectPhoneme::readPhoneme(const char *path)
  85. {
  86. FILE *fp = fopen(path, "rb");
  87. if(!fp) {
  88. return false;
  89. }
  90. destroy();
  91. fread(&timeLength, sizeof(int), 1, fp);
  92. fread(&cepstrumLength, sizeof(float), 1, fp);
  93. fread(&framePeriod, sizeof(int), 1, fp);
  94. f0 = new float[timeLength];
  95. t = new float[timeLength];
  96. melCepstrum = new float*[timeLength];
  97. for(int i = 0; i < timeLength; i++) {
  98. melCepstrum[i] = new float[cepstrumLength];
  99. }
  100. for(int i = 0; i < timeLength; i++) {
  101. fread(melCepstrum[i], sizeof(float), cepstrumLength, fp);
  102. }
  103. fread(f0, sizeof(float), timeLength, fp);
  104. fread(t , sizeof(float), timeLength, fp);
  105. fread(&vorbisSize, sizeof(int), 1, fp);
  106. vorbisData = new char[vorbisSize];
  107. fread(vorbisData, vorbisSize, 1, fp);
  108. if(fread(&baseTimeLength, sizeof(int), 1, fp) == 1 && baseTimeLength > 0)
  109. {
  110. baseTimeAxis = new float[baseTimeLength];
  111. fread(baseTimeAxis, sizeof(float), baseTimeLength, fp);
  112. }
  113. else
  114. {
  115. baseTimeLength = 0;
  116. }
  117. fclose(fp);
  118. mode = VCNT_COMPRESSED;
  119. return true;
  120. }
  121. int vConnectPhoneme::computeWave(double *wave, int length, int fs, double framePeriod, int cepstrumLength)
  122. {
  123. if(!wave || length < 0) { return -1; }
  124. int fftLength;
  125. destroy();
  126. this->framePeriod = (float)framePeriod;
  127. this->timeLength = getSamplesForDIO(fs, length, framePeriod);
  128. this->cepstrumLength = cepstrumLength;
  129. fftLength = getFFTLengthForStar(fs);
  130. double *spectrum = new double[fftLength];
  131. fftw_complex *cepstrum = new fftw_complex[fftLength];
  132. fftw_plan forward = fftw_plan_dft_r2c_1d(fftLength, spectrum, cepstrum, FFTW_ESTIMATE);
  133. fftw_plan inverse = fftw_plan_dft_c2r_1d(fftLength, cepstrum, spectrum, FFTW_ESTIMATE);
  134. double *tmpF0 = new double[timeLength];
  135. double *tmpT = new double[timeLength];
  136. double **specgram = new double*[timeLength];
  137. double **residual = new double*[timeLength];
  138. this->f0 = new float[timeLength];
  139. this->t = new float[timeLength];
  140. this->melCepstrum = new float*[timeLength];
  141. for(int i = 0; i < timeLength; i++) {
  142. this->melCepstrum[i] = new float[cepstrumLength];
  143. }
  144. for(int i = 0; i < timeLength; i++) {
  145. specgram[i] = new double[fftLength];
  146. residual[i] = new double[fftLength];
  147. memset(specgram[i], 0, sizeof(double) * fftLength);
  148. memset(residual[i], 0, sizeof(double) * fftLength);
  149. }
  150. /****************************************************/
  151. /* まず WORLD による分析を行う. */
  152. /* ただしスペクトルの代わりにメルケプを使用する. */
  153. /****************************************************/
  154. dio(wave, length, fs, framePeriod, tmpT, tmpF0);
  155. star(wave, length, fs, tmpT, tmpF0, specgram, false);
  156. for(int i = 0; i < timeLength; i++) {
  157. this->f0[i] = (float)tmpF0[i];
  158. this->t[i] = (float)tmpT[i];
  159. }
  160. /* melCepstrum の計算 → specgram へ再度展開 */
  161. // 展開する場合は melCepstrum の歪み分を残差に持たせる形になる. */
  162. for(int i = 0; i < timeLength; i++) {
  163. int j;
  164. vConnectUtility::stretchToMelScale(spectrum, specgram[i], fftLength / 2 + 1, fs / 2);
  165. for(j = 0; j <= fftLength / 2; j++) {
  166. spectrum[j] = log(spectrum[j]) / fftLength;
  167. }
  168. for(; j < fftLength; j++) {
  169. spectrum[j] = spectrum[fftLength - j];
  170. }
  171. fftw_execute(forward);
  172. for(j = 0; j < cepstrumLength; j++) {
  173. melCepstrum[i][j] = (float)cepstrum[j][0];
  174. cepstrum[j][1] = 0.0;
  175. }
  176. for(; j <= fftLength / 2; j++) {
  177. cepstrum[j][0] = cepstrum[j][1] = 0.0;
  178. }
  179. fftw_execute(inverse);
  180. for(j = 0; j < fftLength; j++) {
  181. spectrum[j] = exp(spectrum[j]);
  182. }
  183. vConnectUtility::stretchFromMelScale(specgram[i], spectrum, fftLength / 2 + 1, fs / 2);
  184. }
  185. platinum_v4(wave, length, fs, tmpT, tmpF0, specgram, residual);
  186. /****************************************************/
  187. /* 次に残差スペクトルを波形の形に持ち直す. */
  188. /* 保存形式は Ogg Vorbis */
  189. /****************************************************/
  190. double *residualWave = new double[fftLength * timeLength];
  191. // 残差波形を計算
  192. for(int i = 0; i < timeLength; i++) {
  193. vConnectUtility::extractResidual(cepstrum, residual[i], fftLength);
  194. fftw_execute(inverse);
  195. for(int j = 0, k = i * fftLength; j < fftLength; j++, k++) {
  196. residualWave[k] = spectrum[j] / (double)fftLength;
  197. }
  198. }
  199. vConnectUtility::newOggVorbis(&vorbisData, &vorbisSize, residualWave, fftLength * timeLength, fs);
  200. delete[] residualWave;
  201. fftw_destroy_plan(forward);
  202. fftw_destroy_plan(inverse);
  203. delete[] spectrum;
  204. delete[] cepstrum;
  205. delete[] tmpF0;
  206. delete[] tmpT;
  207. for(int i = 0; i < timeLength; i++) {
  208. delete[] specgram[i];
  209. delete[] residual[i];
  210. }
  211. delete[] specgram;
  212. delete[] residual;
  213. mode = VCNT_COMPRESSED;
  214. return 0;
  215. }
  216. float *vConnectPhoneme::getMelCepstrum(int index, int *length)
  217. {
  218. if(index < 0) {
  219. index = 0;
  220. } else if(index > timeLength) {
  221. index = timeLength;
  222. }
  223. *length = cepstrumLength;
  224. return melCepstrum[index];
  225. }
  226. float vConnectPhoneme::getF0(int index)
  227. {
  228. if(index < 0) {
  229. index = 0;
  230. } else if(index > timeLength) {
  231. index = timeLength;
  232. }
  233. return f0[index];
  234. }
  235. size_t vConnectPhoneme::vorbisRead(void *dst, size_t size, size_t maxCount, void *vp)
  236. {
  237. V_FILE *p = (V_FILE*)vp;
  238. size_t ret = (p->pos + maxCount * size >= p->size) ? p->size - p->pos : maxCount * size;
  239. memcpy(dst, p->p + p->pos, ret);
  240. p->pos += ret;
  241. return ret / size;
  242. }
  243. int vConnectPhoneme::vorbisSeek(void *vp, ogg_int64_t offset, int flag)
  244. {
  245. V_FILE *p = (V_FILE*)vp;
  246. switch(flag) {
  247. case SEEK_CUR:
  248. p->pos += offset;
  249. break;
  250. case SEEK_END:
  251. p->pos = p->size + offset;
  252. break;
  253. case SEEK_SET:
  254. p->pos = offset;
  255. break;
  256. default:
  257. return -1;
  258. }
  259. if(p->pos > p->size) {
  260. p->pos = p->size;
  261. return -1;
  262. } else if(p->pos < 0) {
  263. p->pos = 0;
  264. return -1;
  265. }
  266. return 0;
  267. }
  268. long vConnectPhoneme::vorbisTell(void *vp)
  269. {
  270. return ((V_FILE*)vp)->pos;
  271. }
  272. int vConnectPhoneme::vorbisClose( void *vp )
  273. {
  274. V_FILE *p = (V_FILE *)vp;
  275. delete p;
  276. return 0;
  277. }
  278. bool vConnectPhoneme::vorbisOpen(OggVorbis_File *ovf)
  279. {
  280. // 保持形式が圧縮形式でない場合.
  281. if(mode != VCNT_COMPRESSED)
  282. {
  283. return false;
  284. }
  285. ov_callbacks callbacks = {
  286. &vConnectPhoneme::vorbisRead,
  287. &vConnectPhoneme::vorbisSeek,
  288. &vConnectPhoneme::vorbisClose,
  289. &vConnectPhoneme::vorbisTell
  290. };
  291. V_FILE *vp = new V_FILE();
  292. vp->p = this->vorbisData;
  293. vp->size = this->vorbisSize;
  294. vp->pos = 0;
  295. if(ov_open_callbacks(vp, ovf, 0, 0, callbacks) != 0) {
  296. return false;
  297. }
  298. return true;
  299. }
  300. void vConnectPhoneme::getOneFrameWorld(double *starSpec,
  301. fftw_complex *residualSpec,
  302. double t, int fftLength,
  303. double *waveform,
  304. fftw_complex *spectrum,
  305. fftw_complex *cepstrum,
  306. fftw_plan forward_r2c,
  307. fftw_plan forward,
  308. fftw_plan inverse)
  309. {
  310. if( mode != VCNT_RAW )
  311. {
  312. for(int i = 0; i < fftLength; i++) {
  313. starSpec[i] = 1.0;
  314. residualSpec[i][0] = residualSpec[i][1] = 0.0;
  315. }
  316. return;
  317. }
  318. int index = (int)(t / this->framePeriod * 1000.0);
  319. if(index < 0)
  320. {
  321. index = 0;
  322. }
  323. if(index >= this->timeLength)
  324. {
  325. index = timeLength - 1;
  326. }
  327. ////// 各バッファと FFT の対応は以下.
  328. // fftw_plan forward_r2c = fftw_plan_dft_r2c_1d(fftLength, waveform, cepstrum, FFTW_ESTIMATE);
  329. // fftw_plan forward = fftw_plan_dft_1d(fftLength, spectrum, cepstrum, FFTW_FORWARD, FFTW_ESTIMATE);
  330. // fftw_plan inverse = fftw_plan_dft_1d(fftLength, cepstrum, spectrum, FFTW_BACKWARD, FFTW_ESTIMATE);
  331. // STAR スペクトルを計算する.
  332. // 残差分を作業用バッファとして使いまわし.
  333. double currentF0 = (f0[index] == 0.0)? DEFAULT_F0 : f0[index];
  334. int sampleRate = Configuration::getDefaultSampleRate();
  335. // starGeneralBody(x, xLen, fs, currentF0, timeAxis[i], fftl, specgram[i], waveform, powerSpec, ySpec,&forwardFFT);
  336. starGeneralBody(wave, waveLength, sampleRate, currentF0, this->t[index], fftLength, starSpec, waveform, waveform, cepstrum, &forward_r2c);
  337. // PLATINUM 残差スペクトルを計算する.
  338. double T0 = (double)sampleRate / currentF0;
  339. int wLen = (int)(0.5 + T0*2.0);
  340. int pulseIndex = pulseLocations[index];
  341. // 波形終了位置を越えてしまっている.
  342. if(wLen+pulseIndex-(int)(0.5+T0) >= waveLength)
  343. {
  344. for(int i = 0;i <= fftLength / 2;i++)
  345. {
  346. residualSpec[i][0] = residualSpec[i][1] = 0.0;
  347. }
  348. } else {
  349. int i;
  350. for(i = 0; i < wLen;i++)
  351. {
  352. int tmpIndex = i + pulseIndex - (int)(0.5+T0);
  353. waveform[i] = wave[max(waveOffset, tmpIndex)] *
  354. (0.5 - 0.5*cos(2.0*PI*(double)(i+1)/((double)(wLen+1))));
  355. }
  356. for(;i < fftLength;i++)
  357. {
  358. waveform[i] = 0.0;
  359. }
  360. // 最小位相スペクトルを求める.
  361. getMinimumPhaseSpectrum(starSpec, spectrum, cepstrum, fftLength, forward, inverse);
  362. // 実波形スペクトルを求める.
  363. fftw_execute(forward_r2c);
  364. // 最小位相はゼロ点を持たないので除算して残差スペクトルを得る.
  365. for(i = 0;i <= fftLength/2;i++)
  366. {
  367. double tmp = spectrum[i][0]*spectrum[i][0] + spectrum[i][1]*spectrum[i][1];
  368. residualSpec[i][0] = ( spectrum[i][0]*cepstrum[i][0] + spectrum[i][1]*cepstrum[i+1][1])/tmp;
  369. residualSpec[i][1] = (-spectrum[i][1]*cepstrum[i][0] + spectrum[i][0]*cepstrum[i+1][1])/tmp;
  370. }
  371. }
  372. }
  373. bool vConnectPhoneme::readRawWave(string dir_path, const UtauParameter *utauParams, double framePeriod)
  374. {
  375. if(!utauParams)
  376. {
  377. return false;
  378. }
  379. WaveBuffer waveFile;
  380. string fileName = utauParams->fileName;
  381. fileName = dir_path + fileName;
  382. if(waveFile.readWaveFile(fileName + ".wav") == 1)
  383. {
  384. worldParameters worldParams;
  385. double *waveBuffer;
  386. int waveLength;
  387. waveBuffer = waveFile.getWavePointer(&waveLength);
  388. // 事前分析データが無いのであれば事前分析を行いファイルとして保存する.
  389. if(worldParams.readParameters((fileName + ".wpd").c_str()) == false)
  390. {
  391. int sampleRate = Configuration::getDefaultSampleRate();
  392. if(worldParams.computeWave(waveBuffer, waveLength, sampleRate, framePeriod))
  393. {
  394. /* 要らない…?
  395. // 同名ファイルの書き込み中に読み込まれるといけない.
  396. #ifdef STND_MULTI_THREAD
  397. if(hMutex)
  398. {
  399. hMutex->lock();
  400. }
  401. #endif
  402. */
  403. worldParams.writeParameters((fileName + ".wpd").c_str());
  404. /*
  405. #ifdef STND_MULTI_THREAD
  406. if(hMutex)
  407. {
  408. hMutex->unlock();
  409. }
  410. #endif
  411. */
  412. }
  413. else
  414. {
  415. // 事前分析に失敗した.
  416. return false;
  417. }
  418. }
  419. destroy();
  420. double beginTime = utauParams->msLeftBlank / 1000.0;
  421. double endTime = (utauParams->msRightBlank < 0) ?
  422. (beginTime - utauParams->msRightBlank / 1000.0) :
  423. ((double)waveLength / (double)waveFile.getSamplingFrequency() - utauParams->msRightBlank / 1000.0);
  424. // 読み込みその他終了したので,波形とパラメタを取り出す.
  425. waveOffset = 0;
  426. timeLength = (int)((endTime - beginTime) / framePeriod * 1000.0 + 0.5);
  427. f0 = new float[timeLength];
  428. t = new float[timeLength];
  429. pulseLocations = new int[timeLength];
  430. worldParams.getParameters(f0, t, pulseLocations, waveFile.getSamplingFrequency(), beginTime, timeLength, framePeriod);
  431. for(int i = 0; i < timeLength; i++)
  432. {
  433. int tmp = (int)(waveFile.getSamplingFrequency() * 1.0 / ((f0[i] == 0.0) ? DEFAULT_F0 : f0[i]) + 0.5);
  434. waveOffset = min(waveOffset, pulseLocations[i] - tmp);
  435. }
  436. int sampleLength = (int)((endTime - beginTime) * waveFile.getSamplingFrequency() + 0.5 - waveOffset);
  437. wave = new double[sampleLength];
  438. int i, j;
  439. // 波形の値を取り出す.
  440. for(i = 0, j = (int)(beginTime * waveFile.getSamplingFrequency() + waveOffset); j < 0; i++, j++)
  441. {
  442. wave[i] = 0.0;
  443. }
  444. for(; i < sampleLength; i++, j++)
  445. {
  446. if(j > waveLength) {
  447. break;
  448. }
  449. wave[i] = waveBuffer[j];
  450. }
  451. for(; i < sampleLength; i++)
  452. {
  453. wave[i] = 0.0;
  454. }
  455. wave -= waveOffset;
  456. mode = VCNT_RAW;
  457. this->framePeriod = framePeriod;
  458. this->waveLength = sampleLength + waveOffset;
  459. // 波形の正規化を行う.
  460. double sum1 = 0.0, sum2 = 0.0;
  461. for(i = 0; i < 2048; i++)
  462. {
  463. sum1 += wave[i] * wave[i];
  464. }
  465. float wavePos = utauParams->msLeftBlank + utauParams->msFixedLength;
  466. for(i = 0, j = (int)(wavePos / 1000.0 * waveFile.getSamplingFrequency()); i < 2048 && j < waveLength - waveOffset; i++, j++)
  467. {
  468. sum2 += waveBuffer[j] * waveBuffer[j];
  469. }
  470. sum1 = max(sum1, sum2);
  471. sum1 = VOL_NORMALIZE / sqrt( (sum1 / 2048.0) );
  472. for(i = waveOffset; i < sampleLength + waveOffset; i++)
  473. {
  474. wave[i] *= sum1;
  475. }
  476. }
  477. return true;
  478. }
  479. void vConnectPhoneme::setTimeAxis(double *t, int length)
  480. {
  481. if(length != timeLength)
  482. {
  483. return;
  484. }
  485. for(int i = 0; i < length; i++)
  486. {
  487. this->t[i] = t[i];
  488. }
  489. }
  490. void vConnectPhoneme::setBaseTimeAxis(double *base, int length)
  491. {
  492. if(length <= 0)
  493. {
  494. return;
  495. }
  496. delete[] baseTimeAxis;
  497. baseTimeAxis = new float[length];
  498. baseTimeLength = length;
  499. for(int i = 0; i < length; i++)
  500. {
  501. this->baseTimeAxis[i] = base[i];
  502. }
  503. }
  504. double vConnectPhoneme::getFrameTime(int index)
  505. {
  506. index = max(0, min(index, timeLength - 1));
  507. return t[index];
  508. }
  509. double vConnectPhoneme::getBaseFrameTime(int index)
  510. {
  511. if(baseTimeAxis == NULL)
  512. {
  513. return (double)index * framePeriod / 1000.0;
  514. }
  515. index = max(0, min(index, baseTimeLength - 1));
  516. return baseTimeAxis[index];
  517. }