SpeechSynthesizer.cpp 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756
  1. /* SpeechSynthesizer.cpp
  2. *
  3. * Copyright (C) 2011-2017 David Weenink
  4. *
  5. * This code is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This code is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this work. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. /*
  19. #include "espeakdata_FileInMemory.h"
  20. djmw 20111214
  21. */
  22. #include "espeak_ng_version.h"
  23. #include "espeak_ng.h"
  24. #include "espeakdata_FileInMemory.h"
  25. #include "SpeechSynthesizer.h"
  26. #include "Strings_extensions.h"
  27. #include "speak_lib.h"
  28. #include "encoding.h"
  29. #include "string.h"
  30. #include "translate.h"
  31. #include "oo_DESTROY.h"
  32. #include "SpeechSynthesizer_def.h"
  33. #include "oo_COPY.h"
  34. #include "SpeechSynthesizer_def.h"
  35. #include "oo_EQUAL.h"
  36. #include "SpeechSynthesizer_def.h"
  37. #include "oo_CAN_WRITE_AS_ENCODING.h"
  38. #include "SpeechSynthesizer_def.h"
  39. #include "oo_WRITE_TEXT.h"
  40. #include "SpeechSynthesizer_def.h"
  41. #include "oo_WRITE_BINARY.h"
  42. #include "SpeechSynthesizer_def.h"
  43. #include "oo_READ_TEXT.h"
  44. #include "SpeechSynthesizer_def.h"
  45. #include "oo_READ_BINARY.h"
  46. #include "SpeechSynthesizer_def.h"
  47. #include "oo_DESCRIPTION.h"
  48. #include "SpeechSynthesizer_def.h"
  49. #define espeak_SAMPLINGFREQUENCY 22050
  50. extern structMelderDir praatDir;
  51. extern int option_phoneme_events;
  52. Thing_implement (EspeakVoice, Daata, 0);
  53. autoEspeakVoice EspeakVoice_create () {
  54. try {
  55. autoEspeakVoice me = Thing_new (EspeakVoice);
  56. my numberOfFormants = 9; // equals N_PEAKS
  57. my numberOfKlattParameters = 8;
  58. my klattv = NUMvector<int32> (1, my numberOfKlattParameters);
  59. my freq = NUMvector<int32> (1, my numberOfFormants);
  60. my height = NUMvector<int32> (1, my numberOfFormants); // 100% = 256
  61. my width = NUMvector<int32> (0, my numberOfFormants); // 100% = 256
  62. my freqadd = NUMvector<int32> (0, my numberOfFormants); // Hz
  63. // copies without temporary adjustments from embedded commands
  64. my freq2 = NUMvector<int32> (0, my numberOfFormants); // 100% = 256
  65. my height2 = NUMvector<int32> (0, my numberOfFormants); // 100% = 256
  66. my width2 = NUMvector<int32> (0, my numberOfFormants); // 100% = 256
  67. my breath = NUMvector<int32> (0, my numberOfFormants); // amount of breath for each formant. breath[0] indicates whether any are set.
  68. my breathw = NUMvector<int32> (0, my numberOfFormants); // width of each breath formant
  69. my numberOfToneAdjusts = 1000; // equals N_TONE_ADJUST in voice.h
  70. my tone_adjust = NUMvector<unsigned char> (1, my numberOfToneAdjusts);
  71. EspeakVoice_setDefaults (me.get());
  72. return me;
  73. } catch (MelderError) {
  74. Melder_throw (U"EspeakVoice not created.");
  75. }
  76. }
  77. void EspeakVoice_setDefaults (EspeakVoice me) {
  78. (void) me;
  79. }
  80. void EspeakVoice_initFromEspeakVoice (EspeakVoice me, voice_t *voice) {
  81. my v_name = Melder_dup (Melder_peek8to32 (voice -> v_name));
  82. my phoneme_tab_ix = voice -> phoneme_tab_ix;
  83. my pitch_base = voice -> pitch_base;
  84. my pitch_range = voice -> pitch_range;
  85. my speedf1 = voice -> speedf1;
  86. my speedf2 = voice -> speedf2;
  87. my speedf3 = voice -> speedf3;
  88. my speed_percent = voice -> speed_percent;
  89. my flutter = voice -> flutter;
  90. my roughness = voice -> roughness;
  91. my echo_delay = voice -> echo_delay;
  92. my echo_amp = voice -> echo_amp;
  93. my n_harmonic_peaks = voice -> n_harmonic_peaks;
  94. my peak_shape = voice -> peak_shape;
  95. my voicing = voice -> voicing;
  96. my formant_factor = voice -> formant_factor;
  97. my consonant_amp = voice -> consonant_amp;
  98. my consonant_ampv = voice -> consonant_ampv;
  99. my samplerate = voice -> samplerate;
  100. my numberOfKlattParameters = 8;
  101. for (integer i = 1; i <= my numberOfKlattParameters; i ++) {
  102. my klattv [i] = voice -> klattv [i - 1];
  103. }
  104. for (integer i = 1; i <= my numberOfFormants; i ++) {
  105. my freq [i] = voice -> freq [i - 1];
  106. my height [i] = voice -> height [i - 1];
  107. my width [i] = voice -> width [i - 1];
  108. my freqadd [i] = voice -> freqadd [i - 1];
  109. my freq2 [i] = voice -> freq2 [i - 1];
  110. my height2 [i] = voice -> height2 [i - 1];
  111. my width2 [i] = voice -> width2 [i - 1];
  112. my breath [i] = voice -> breath [i - 1];
  113. my breathw [i] = voice -> breathw [i - 1];
  114. }
  115. my numberOfToneAdjusts = 1000;
  116. for (integer i = 1; i <= my numberOfToneAdjusts; i ++) {
  117. my tone_adjust [i] = voice -> tone_adjust [i - 1];
  118. }
  119. }
  120. void EspeakVoice_into_voice (EspeakVoice me, voice_t *voice) {
  121. if (my v_name) {
  122. strncpy (voice -> v_name, Melder_peek32to8 (my v_name.get()), 40);
  123. }
  124. if (my language_name) {
  125. strncpy (voice -> language_name, Melder_peek32to8 (my language_name.get()), 20);
  126. }
  127. voice -> phoneme_tab_ix = my phoneme_tab_ix;
  128. voice -> pitch_base = my pitch_base;
  129. voice -> pitch_range = my pitch_range;
  130. voice -> speedf1 = my speedf1;
  131. voice -> speedf2 = my speedf2;
  132. voice -> speedf3 = my speedf3;
  133. voice -> speed_percent = my speed_percent;
  134. voice -> flutter = my flutter;
  135. voice -> roughness = my roughness;
  136. voice -> echo_delay = my echo_delay;
  137. voice -> echo_amp = my echo_amp;
  138. voice -> n_harmonic_peaks = my n_harmonic_peaks;
  139. voice -> peak_shape = my peak_shape;
  140. voice -> voicing = my voicing;
  141. voice -> formant_factor = my formant_factor;
  142. voice -> consonant_amp = my consonant_amp;
  143. voice -> consonant_ampv = my consonant_ampv;
  144. voice -> samplerate = my samplerate;
  145. for (integer i = 1; i <= my numberOfKlattParameters; i ++) {
  146. voice -> klattv [i - 1] = my klattv [i];
  147. }
  148. for (integer i = 1; i <= my numberOfFormants; i ++) {
  149. voice -> freq [i - 1] = my freq [i];
  150. voice -> height [i - 1] = my height [i];
  151. voice -> width [i - 1] = my width [i];
  152. voice -> freqadd [i - 1] = my freqadd [i];
  153. voice -> freq2 [i - 1] = my freq2 [i];
  154. voice -> height2 [i - 1] = my height2 [i];
  155. voice -> width2 [i - 1] = my width2 [i];
  156. voice -> breath [i - 1] = my breath [i];
  157. voice -> breathw [i - 1] = my breathw [i];
  158. }
  159. for (integer i = 1; i <= my numberOfToneAdjusts; i ++) {
  160. voice -> tone_adjust [i - 1] = voice -> tone_adjust [i];
  161. }
  162. }
  163. Thing_implement (SpeechSynthesizer, Daata, 1);
  164. void structSpeechSynthesizer :: v_info () {
  165. our SpeechSynthesizer_Parent :: v_info ();
  166. MelderInfo_writeLine (U"Synthesizer version: espeak-ng ", our d_synthesizerVersion.get());
  167. MelderInfo_writeLine (U"Language: ", our d_languageName.get());
  168. MelderInfo_writeLine (U"Voice: ", our d_voiceName.get());
  169. MelderInfo_writeLine (U"Phoneme set: ", our d_phonemeSet.get());
  170. MelderInfo_writeLine (U"Input text format: ", (our d_inputTextFormat == SpeechSynthesizer_INPUT_TEXTONLY ? U"text only" :
  171. d_inputTextFormat == SpeechSynthesizer_INPUT_PHONEMESONLY ? U"phonemes only" : U"tagged text"));
  172. MelderInfo_writeLine (U"Input phoneme coding: ", (our d_inputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_KIRSHENBAUM ? U"Kirshenbaum" : U"???"));
  173. MelderInfo_writeLine (U"Sampling frequency: ", our d_samplingFrequency, U" Hz");
  174. MelderInfo_writeLine (U"Word gap: ", our d_wordgap, U" s");
  175. MelderInfo_writeLine (U"Pitch multiplier: ", our d_pitchAdjustment, U" (0.5-2.0)");
  176. MelderInfo_writeLine (U"Pitch range multiplier: ", our d_pitchRange, U" (0.0-2.0)");
  177. MelderInfo_writeLine (U"Speaking rate: ", our d_wordsPerMinute, U" words per minute",
  178. our d_estimateSpeechRate ? U" (but estimated from speech if possible)" : U" (fixed)");
  179. MelderInfo_writeLine (U"Output phoneme coding: ",
  180. our d_inputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_KIRSHENBAUM ? U"Kirshenbaum" :
  181. our d_inputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA ? U"IPA" : U"???"
  182. );
  183. }
  184. static void NUMvector_extendNumberOfElements (integer elementSize, void **v, integer lo, integer *hi, integer extraDemand)
  185. {
  186. try {
  187. byte *result;
  188. if (! *v) {
  189. integer newhi = lo + extraDemand - 1;
  190. result = NUMvector_generic (elementSize, lo, newhi, true);
  191. *hi = newhi;
  192. } else {
  193. integer offset = lo * elementSize;
  194. for (;;) { // not very infinite: 99.999 % of the time once, 0.001 % twice
  195. result = reinterpret_cast <byte *> (Melder_realloc ((char *) *v + offset, (*hi - lo + 1 + extraDemand) * elementSize));
  196. if ((result -= offset)) break; // this will normally succeed at the first try
  197. (void) Melder_realloc_f (result + offset, 1); // ??make "sure" that the second try will succeed
  198. }
  199. (*hi) += extraDemand;
  200. memset (result + *hi * elementSize, 0, elementSize); // initialize the new elements to zeroes
  201. }
  202. *v = result;
  203. } catch (MelderError) {
  204. Melder_throw (U"Vector: size not extended.");
  205. }
  206. }
  207. static void NUMvector_supplyStorage (integer elementSize, void **v, integer lo, integer *hi, integer nfilled, integer extraDemand)
  208. {
  209. integer old_capacity = *hi - lo + 1, new_capacity = nfilled + extraDemand;
  210. if (new_capacity < old_capacity) return;
  211. new_capacity = new_capacity > 2 * old_capacity ? new_capacity : 2 * old_capacity;
  212. NUMvector_extendNumberOfElements (elementSize, v, lo, hi, new_capacity);
  213. }
  214. template <class T>
  215. void NUMvector_supplyStorage (T** v, integer lo, integer *hi, integer nfilled, integer extraDemand) {
  216. NUMvector_supplyStorage (sizeof (T), (void**) v, lo, hi, nfilled, extraDemand);
  217. }
  218. static int synthCallback (short *wav, int numsamples, espeak_EVENT *events)
  219. {
  220. char phoneme_name[9];
  221. if (wav == 0) return 1;
  222. // It is essential that the SpeechSynthesizer is identified here by the user_data,
  223. // because the espeakEVENT_LIST_TERMINATED event may still be accompanied by
  224. // a piece of audio data!!
  225. SpeechSynthesizer me = (SpeechSynthesizer) (events -> user_data);
  226. while (events -> type != espeakEVENT_LIST_TERMINATED) {
  227. if (events -> type == espeakEVENT_SAMPLERATE) {
  228. my d_internalSamplingFrequency = events -> id.number;
  229. } else {
  230. //my events = Table "time type type-t t-pos length a-pos sample id uniq";
  231. // 1 2 3 4 5 6 7 8 9
  232. Table_appendRow (my d_events.get());
  233. integer irow = my d_events -> rows.size;
  234. double time = events -> audio_position * 0.001;
  235. Table_setNumericValue (my d_events.get(), irow, 1, time);
  236. Table_setNumericValue (my d_events.get(), irow, 2, events -> type);
  237. // Column 3 will be filled afterwards
  238. Table_setNumericValue (my d_events.get(), irow, 4, events -> text_position);
  239. Table_setNumericValue (my d_events.get(), irow, 5, events -> length);
  240. Table_setNumericValue (my d_events.get(), irow, 6, events -> audio_position);
  241. Table_setNumericValue (my d_events.get(), irow, 7, events -> sample);
  242. if (events -> type == espeakEVENT_MARK || events -> type == espeakEVENT_PLAY) {
  243. Table_setStringValue (my d_events.get(), irow, 8, Melder_peek8to32 (events -> id.name));
  244. } else {
  245. // Ugly hack because id.string is not 0-terminated if 8 chars long!
  246. memcpy (phoneme_name, events -> id.string, 8);
  247. phoneme_name[8] = 0;
  248. Table_setStringValue (my d_events.get(), irow, 8, Melder_peek8to32 (phoneme_name));
  249. }
  250. Table_setNumericValue (my d_events.get(), irow, 9, events -> unique_identifier);
  251. }
  252. events++;
  253. }
  254. if (me) {
  255. NUMvector_supplyStorage<int> (& my d_wav, 1, & my d_wavCapacity, my d_numberOfSamples, numsamples);
  256. for (integer i = 1; i <= numsamples; i++) {
  257. my d_wav [my d_numberOfSamples + i] = wav [i - 1];
  258. }
  259. my d_numberOfSamples += numsamples;
  260. }
  261. return 0;
  262. }
  263. conststring32 SpeechSynthesizer_getLanguageCode (SpeechSynthesizer me) {
  264. try {
  265. integer irow = Table_searchColumn (espeakdata_languages_propertiesTable.get(), 2, my d_languageName.get());
  266. if (irow == 0) {
  267. Melder_throw (U"Cannot find language \"", my d_languageName.get(), U"\".");
  268. }
  269. return Table_getStringValue_Assert (espeakdata_languages_propertiesTable.get(), irow, 1);
  270. } catch (MelderError) {
  271. Melder_throw (me, U": Cannot find language code.");
  272. }
  273. }
  274. conststring32 SpeechSynthesizer_getPhonemeCode (SpeechSynthesizer me) {
  275. try {
  276. integer irow = Table_searchColumn (espeakdata_languages_propertiesTable.get(), 2, my d_phonemeSet.get());
  277. if (irow == 0) {
  278. Melder_throw (U"Cannot find phoneme set \"", my d_phonemeSet.get(), U"\".");
  279. }
  280. return Table_getStringValue_Assert (espeakdata_languages_propertiesTable.get(), irow, 1);
  281. } catch (MelderError) {
  282. Melder_throw (me, U": Cannot find phoneme code.");
  283. }
  284. }
  285. conststring32 SpeechSynthesizer_getVoiceCode (SpeechSynthesizer me) {
  286. try {
  287. integer irow = Table_searchColumn (espeakdata_voices_propertiesTable.get(), 2, my d_voiceName.get());
  288. if (irow == 0) {
  289. Melder_throw (U": Cannot find voice variant \"", my d_voiceName.get(), U"\".");
  290. }
  291. return Table_getStringValue_Assert (espeakdata_voices_propertiesTable.get(), irow, 1);
  292. } catch (MelderError) {
  293. Melder_throw (me, U": Cannot find voice code.");
  294. }
  295. }
  296. autoSpeechSynthesizer SpeechSynthesizer_create (conststring32 languageName, conststring32 voiceName) {
  297. try {
  298. autoSpeechSynthesizer me = Thing_new (SpeechSynthesizer);
  299. my d_synthesizerVersion = Melder_dup (ESPEAK_NG_VERSION);
  300. my d_languageName = Melder_dup (languageName);
  301. (void) SpeechSynthesizer_getLanguageCode (me.get()); // existence check
  302. my d_voiceName = Melder_dup (voiceName);
  303. (void) SpeechSynthesizer_getVoiceCode (me.get()); // existence check
  304. my d_phonemeSet = Melder_dup (languageName);
  305. SpeechSynthesizer_setTextInputSettings (me.get(), SpeechSynthesizer_INPUT_TEXTONLY, SpeechSynthesizer_PHONEMECODINGS_KIRSHENBAUM);
  306. SpeechSynthesizer_setSpeechOutputSettings (me.get(), 44100.0, 0.01, 1.0, 1.0, 175, SpeechSynthesizer_PHONEMECODINGS_IPA);
  307. SpeechSynthesizer_setEstimateSpeechRateFromSpeech (me.get(), true);
  308. return me;
  309. } catch (MelderError) {
  310. Melder_throw (U"SpeechSynthesizer not created.");
  311. }
  312. }
  313. void SpeechSynthesizer_setTextInputSettings (SpeechSynthesizer me, int inputTextFormat, int inputPhonemeCoding) {
  314. my d_inputTextFormat = inputTextFormat;
  315. my d_inputPhonemeCoding = inputPhonemeCoding;
  316. }
  317. void SpeechSynthesizer_setEstimateSpeechRateFromSpeech (SpeechSynthesizer me, bool estimate) {
  318. my d_estimateSpeechRate = estimate;
  319. }
  320. void SpeechSynthesizer_setSpeechOutputSettings (SpeechSynthesizer me, double samplingFrequency, double wordgap, double pitchAdjustment, double pitchRange, double wordsPerMinute, int outputPhonemeCoding) {
  321. my d_samplingFrequency = samplingFrequency;
  322. my d_wordgap = wordgap;
  323. pitchAdjustment = pitchAdjustment < 0.5 ? 0.5 : (pitchAdjustment > 2.0 ? 2.0 : pitchAdjustment);
  324. my d_pitchAdjustment = pitchAdjustment;
  325. pitchRange = pitchRange < 0.0 ? 0.0 : (pitchRange > 2.0 ? 2.0 : pitchRange);
  326. my d_pitchRange = pitchRange;
  327. if (wordsPerMinute <= 0.0) wordsPerMinute = 175.0;
  328. if (wordsPerMinute > 450.0) wordsPerMinute = 450.0;
  329. if (wordsPerMinute < 80.0) wordsPerMinute = 80.0;
  330. my d_wordsPerMinute = wordsPerMinute;
  331. my d_outputPhonemeCoding = outputPhonemeCoding;
  332. }
  333. void SpeechSynthesizer_playText (SpeechSynthesizer me, conststring32 text) {
  334. autoSound thee = SpeechSynthesizer_to_Sound (me, text, nullptr, nullptr);
  335. Sound_play (thee.get(), nullptr, nullptr);
  336. }
  337. static autoSound buffer_to_Sound (int *wav, integer numberOfSamples, double samplingFrequency)
  338. {
  339. try {
  340. double dx = 1.0 / samplingFrequency;
  341. double xmax = numberOfSamples * dx;
  342. autoSound thee = Sound_create (1, 0.0, xmax, numberOfSamples, dx, dx / 2.0);
  343. for (integer i = 1; i <= numberOfSamples; i++) {
  344. thy z[1][i] = wav[i] / 32768.0;
  345. }
  346. return thee;
  347. } catch (MelderError) {
  348. Melder_throw (U"Sound not created from synthesizer data.");
  349. }
  350. }
  351. static void IntervalTier_addBoundaryUnsorted (IntervalTier me, integer iinterval, double time, conststring32 newLabel, bool isNewleftLabel) {
  352. if (time <= my xmin || time >= my xmax) {
  353. Melder_throw (U"Time is outside interval domains.");
  354. }
  355. // Find interval to split
  356. if (iinterval <= 0) {
  357. iinterval = IntervalTier_timeToLowIndex (me, time);
  358. }
  359. // Modify end time of left label
  360. TextInterval ti = my intervals.at [iinterval];
  361. ti -> xmax = time;
  362. if (isNewleftLabel) TextInterval_setText (ti, newLabel);
  363. autoTextInterval ti_new = TextInterval_create (time, my xmax, (! isNewleftLabel ? newLabel : U"" ));
  364. my intervals. addItem_unsorted_move (ti_new.move());
  365. }
  366. static void Table_setEventTypeString (Table me) {
  367. try {
  368. for (integer i = 1; i <= my rows.size; i ++) {
  369. int type = Table_getNumericValue_Assert (me, i, 2);
  370. conststring32 label = U"0";
  371. if (type == espeakEVENT_WORD) {
  372. label = U"word";
  373. } else if (type == espeakEVENT_SENTENCE) {
  374. label = U"sent";
  375. } else if (type == espeakEVENT_MARK) {
  376. label = U"mark";
  377. } else if (type == espeakEVENT_PLAY) {
  378. label = U"play";
  379. } else if (type == espeakEVENT_END) {
  380. label = U"s-end";
  381. } else if (type == espeakEVENT_MSG_TERMINATED) {
  382. label = U"msg_term";
  383. } else if (type == espeakEVENT_PHONEME) {
  384. label = U"phoneme";
  385. }
  386. Table_setStringValue (me, i, 3, label);
  387. }
  388. } catch (MelderError) {
  389. Melder_throw (U"Event types not set.");
  390. }
  391. }
  392. static void MelderString_trimWhiteSpaceAtEnd (MelderString *me) {
  393. while (my length > 1 && (my string [my length - 1] == U' ' || my string [my length - 1] == U'\t'
  394. || my string [my length - 1] == U'\r' || my string [my length - 1] == U'\n'))
  395. {
  396. my string [my length - 1] = U'\0';
  397. my length--;
  398. }
  399. }
  400. static void IntervalTier_mergeSpecialIntervals (IntervalTier me) {
  401. integer intervalIndex = my intervals.size;
  402. TextInterval right = my intervals.at [intervalIndex];
  403. integer labelLength_right = TextInterval_labelLength (right);
  404. bool isEmptyInterval_right = labelLength_right == 0 || (labelLength_right == 1 && Melder_equ (right -> text.get(), U"\001"));
  405. while (intervalIndex > 1) {
  406. TextInterval left = my intervals.at [intervalIndex - 1];
  407. integer labelLength_left = TextInterval_labelLength (left);
  408. bool isEmptyInterval_left = labelLength_left == 0 || (labelLength_left == 1 && Melder_equ (left -> text.get(), U"\001"));
  409. if (isEmptyInterval_right && isEmptyInterval_left) {
  410. // remove right interval and empty left interval
  411. left -> xmax = right -> xmax;
  412. TextInterval_setText (left, U"");
  413. my intervals. removeItem (intervalIndex);
  414. }
  415. right = left;
  416. isEmptyInterval_right = isEmptyInterval_left;
  417. intervalIndex --;
  418. }
  419. }
  #if 0
  /*
      Disabled code (not compiled).
      Insert a boundary at time t and merge/delete the intervals after this time.
      Does nothing if t is not strictly inside the tier's domain.
  */
  static void IntervalTier_insertBoundaryAndMergeIntervalsAfter (IntervalTier me, double t) {
      const bool outsideDomain = ( t <= my xmin || t >= my xmax );
      if (outsideDomain) {
          return;
      }
      const integer intervalNumber = IntervalTier_timeToLowIndex (me, t);

      // Drop intervals from the end until at most one is left to the right of intervalNumber.
      while (my intervals.size > intervalNumber + 1) {
          my intervals. removeItem (my intervals.size);
      }

      TextInterval ti = my intervals.at [intervalNumber];
      if (ti -> xmin != t) {
          // t lies inside the interval: end it at t and let the last interval cover the rest.
          ti -> xmax = t;
          TextInterval last = my intervals.at [my intervals.size];
          last -> xmin = t;
          last -> xmax = my xmax;
          TextInterval_setText (last, U"");
      } else {
          // t happens to be on a boundary: remove the next interval if it exists.
          if (my intervals.size > intervalNumber) {
              my intervals. removeItem (my intervals .size);
          }
          ti -> xmax = my xmax;
          TextInterval_setText (ti, U"");
      }
  }
  #endif
  /*
      Return true if t2 differs from t1 by at most 1e-12 times the magnitude of t1.
      The tolerance is relative to t1 only, so the test is not symmetric.
  */
  static bool almost_equal (double t1, double t2) {
      const double tolerance = 1e-12 * fabs (t1);
      const double difference = fabs (t1 - t2);
      // "<=" rather than "<": with t1 == 0 the tolerance is zero, and a zero difference must still match
      return difference <= tolerance;
  }
  451. static void IntervalTier_insertEmptyIntervalsFromOtherTier (IntervalTier to, IntervalTier from) {
  452. for (integer iint = 1; iint <= from -> intervals.size; iint ++) {
  453. TextInterval tifrom = from -> intervals.at [iint];
  454. if (TextInterval_labelLength (tifrom) == 0) { // found empty interval
  455. double t_left = tifrom -> xmin, t_right = tifrom -> xmax;
  456. integer intervalIndex_to = IntervalTier_timeToLowIndex (to, t_left);
  457. if (intervalIndex_to > 0) { // insert to the right of intervalIndex_to
  458. TextInterval tito = to -> intervals.at [intervalIndex_to];
  459. if (! almost_equal (tito -> xmin, t_left)) { // not on the start boundary of the interval, it cannot be at xmax
  460. autoTextInterval newInterval = TextInterval_create (t_left, tito -> xmax, U"");
  461. tito -> xmax = t_left;
  462. to -> intervals. addItem_move (newInterval.move());
  463. }
  464. }
  465. intervalIndex_to = IntervalTier_timeToHighIndex (to, t_right);
  466. TextInterval tito = to -> intervals.at [intervalIndex_to];
  467. if (intervalIndex_to > 0) {
  468. if (! almost_equal (t_right, tito -> xmax)) { // insert to the left of intervalIndex_to
  469. autoTextInterval newInterval = TextInterval_create (tito -> xmin, t_right, U"");
  470. tito -> xmin = t_right;
  471. to -> intervals. addItem_move (newInterval.move());
  472. }
  473. }
  474. }
  475. }
  476. }
  477. static void IntervalTier_removeVeryShortIntervals (IntervalTier me) {
  478. integer iint = 1;
  479. while (iint <= my intervals.size) {
  480. TextInterval ti = my intervals.at [iint];
  481. if (almost_equal (ti -> xmin, ti -> xmax)) {
  482. my intervals.removeItem (iint);
  483. } else {
  484. iint ++;
  485. }
  486. }
  487. }
  488. static autoTextGrid Table_to_TextGrid (Table me, conststring32 text, double xmin, double xmax) {
  489. //Table_createWithColumnNames (0, L"time type type-t t-pos length a-pos sample id uniq");
  490. try {
  491. integer length, textLength = str32len (text);
  492. integer numberOfRows = my rows.size;
  493. integer timeColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"time");
  494. integer typeColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"type");
  495. integer tposColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"t-pos");
  496. integer idColumnIndex = Table_getColumnIndexFromColumnLabel (me, U"id");
  497. autoTextGrid thee = TextGrid_create (xmin, xmax, U"sentence clause word phoneme", U"");
  498. TextGrid_setIntervalText (thee.get(), 1, 1, text);
  499. integer p1c = 1, p1w = 1;
  500. double time_phon_p = xmin;
  501. bool wordEnd = false;
  502. autoMelderString mark;
  503. IntervalTier clauses = (IntervalTier) thy tiers->at [2];
  504. IntervalTier words = (IntervalTier) thy tiers->at [3];
  505. IntervalTier phonemes = (IntervalTier) thy tiers->at [4];
  506. for (integer i = 1; i <= numberOfRows; i++) {
  507. double time = Table_getNumericValue_Assert (me, i, timeColumnIndex);
  508. int type = Table_getNumericValue_Assert (me, i, typeColumnIndex);
  509. integer pos = Table_getNumericValue_Assert (me, i, tposColumnIndex);
  510. if (type == espeakEVENT_SENTENCE) {
  511. // Only insert a new boundary, no text
  512. // text will be inserted at end sentence event
  513. if (time > xmin && time < xmax) {
  514. IntervalTier_addBoundaryUnsorted (clauses, clauses -> intervals.size, time, U"", true);
  515. }
  516. p1c = pos;
  517. } else if (type == espeakEVENT_END) {
  518. // End of clause: insert new boundary, and fill left interval with text
  519. length = pos - p1c + 1;
  520. MelderString_ncopy (&mark, text + p1c - 1, length);
  521. MelderString_trimWhiteSpaceAtEnd (& mark);
  522. if (time > xmin && time < xmax) {
  523. IntervalTier_addBoundaryUnsorted (clauses, clauses -> intervals.size, time, mark.string, true);
  524. } else {
  525. TextGrid_setIntervalText (thee.get(), 2, clauses -> intervals.size, mark.string);
  526. }
  527. p1c = pos;
  528. // End of clause always signals "end of a word"
  529. if (pos <= textLength) {
  530. length = pos - p1w + 1;
  531. MelderString_ncopy (&mark, text + p1w - 1, length);
  532. MelderString_trimWhiteSpaceAtEnd (& mark);
  533. if (time > xmin && time < xmax) {
  534. IntervalTier_addBoundaryUnsorted (words, words -> intervals.size, time, mark.string, true);
  535. } else {
  536. TextGrid_setIntervalText (thee.get(), 3, words -> intervals.size, mark.string);
  537. }
  538. // now the next word event should not trigger setting the left interval text
  539. wordEnd = false;
  540. }
  541. } else if (type == espeakEVENT_WORD) {
  542. if (pos < p1w) {
  543. continue;
  544. }
  545. if (time > xmin && time < xmax) {
  546. length = pos - p1w;
  547. if (pos == textLength) {
  548. length++;
  549. }
  550. MelderString_ncopy (&mark, text + p1w - 1, length);
  551. MelderString_trimWhiteSpaceAtEnd (& mark);
  552. IntervalTier_addBoundaryUnsorted (words, words -> intervals.size, time, ( wordEnd ? mark.string : U"" ), true);
  553. }
  554. wordEnd = true;
  555. p1w = pos;
  556. } else if (type == espeakEVENT_PHONEME) {
  557. conststring32 id = Table_getStringValue_Assert (me, i, idColumnIndex);
  558. if (time > time_phon_p) {
  559. // Insert new boudary and label interval with the id
  560. // TODO: Translate the id to the correct notation
  561. TextInterval ti = phonemes -> intervals.at [phonemes -> intervals.size];
  562. if (time > ti -> xmin && time < ti -> xmax) {
  563. IntervalTier_addBoundaryUnsorted (phonemes, phonemes -> intervals.size, time, id, false);
  564. }
  565. } else {
  566. // Just in case the phoneme starts at xmin we only need to set interval text
  567. TextGrid_setIntervalText (thee.get(), 4, phonemes -> intervals.size, id);
  568. }
  569. time_phon_p = time;
  570. }
  571. }
  572. clauses -> intervals. sort ();
  573. words -> intervals. sort ();
  574. phonemes -> intervals. sort ();
  575. IntervalTier_mergeSpecialIntervals (phonemes); // Merge neighbouring empty U"" and U"\001" intervals
  576. IntervalTier_removeVeryShortIntervals (words);
  577. IntervalTier_removeVeryShortIntervals (clauses);
  578. /* Use empty intervals in phoneme tier for more precision in the word tier */
  579. IntervalTier_insertEmptyIntervalsFromOtherTier (words, phonemes);
  580. IntervalTier_mergeSpecialIntervals (words); // Merge neighbouring empty U"" and U"\001" intervals
  581. return thee;
  582. } catch (MelderError) {
  583. Melder_throw (U"TextGrid not created from Table with events.");
  584. }
  585. }
  586. static void espeakdata_SetVoiceByName (conststring32 languageName, conststring32 voiceName)
  587. {
  588. espeak_VOICE voice_selector;
  589. memset (& voice_selector, 0, sizeof voice_selector);
  590. voice_selector.name = Melder_peek32to8 (Melder_cat (languageName, U"+", voiceName)); // include variant name in voice stack ??
  591. if (LoadVoice (Melder_peek32to8 (languageName), 1)) {
  592. LoadVoice (Melder_peek32to8 (voiceName), 2);
  593. DoVoiceChange (voice);
  594. SetVoiceStack (& voice_selector, Melder_peek32to8 (voiceName));
  595. }
  596. }
  597. autoSound SpeechSynthesizer_to_Sound (SpeechSynthesizer me, conststring32 text, autoTextGrid *tg, autoTable *events) {
  598. try {
  599. espeak_ng_InitializePath (nullptr); // PATH_ESPEAK_DATA
  600. espeak_ng_ERROR_CONTEXT context = { 0 };
  601. espeak_ng_STATUS status = espeak_ng_Initialize (& context);
  602. if (status != ENS_OK) {
  603. Melder_throw (U"Internal espeak error.", status);
  604. }
  605. int synth_flags = espeakCHARS_WCHAR;
  606. if (my d_inputTextFormat == SpeechSynthesizer_INPUT_TAGGEDTEXT) {
  607. synth_flags |= espeakSSML;
  608. }
  609. if (my d_inputTextFormat != SpeechSynthesizer_INPUT_TEXTONLY) {
  610. synth_flags |= espeakPHONEMES;
  611. }
  612. option_phoneme_events = espeakINITIALIZE_PHONEME_EVENTS; // extern int option_phoneme_events;
  613. if (my d_outputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA) {
  614. option_phoneme_events |= espeakINITIALIZE_PHONEME_IPA;
  615. }
  616. espeak_ng_SetParameter (espeakRATE, my d_wordsPerMinute, 0);
  617. /*
  618. pitchAdjustment_0_99 = a * log10 (my d_pitchAdjustment) + b,
  619. where 0.5 <= my d_pitchAdjustment <= 2
  620. pitchRange_0_99 = my d_pitchRange * 49.5,
  621. where 0 <= my d_pitchRange <= 2
  622. */
  623. int pitchAdjustment_0_99 = (int) ((49.5 / log10(2.0)) * log10 (my d_pitchAdjustment) + 49.5); // rounded towards zero
  624. espeak_ng_SetParameter (espeakPITCH, pitchAdjustment_0_99, 0);
  625. int pitchRange_0_99 = (int) (my d_pitchRange * 49.5); // rounded towards zero
  626. espeak_ng_SetParameter (espeakRANGE, pitchRange_0_99, 0);
  627. conststring32 languageCode = SpeechSynthesizer_getLanguageCode (me);
  628. conststring32 voiceCode = SpeechSynthesizer_getVoiceCode (me);
  629. espeak_ng_SetVoiceByName(Melder_peek32to8 (Melder_cat (languageCode, U"+", voiceCode)));
  630. int wordgap_10ms = my d_wordgap * 100; // espeak wordgap is in units of 10 ms
  631. espeak_ng_SetParameter (espeakWORDGAP, wordgap_10ms, 0);
  632. espeak_ng_SetParameter (espeakCAPITALS, 0, 0);
  633. espeak_ng_SetParameter (espeakPUNCTUATION, espeakPUNCT_NONE, 0);
  634. status = espeak_ng_InitializeOutput (ENOUTPUT_MODE_SYNCHRONOUS, 2048, nullptr);
  635. espeak_SetSynthCallback (synthCallback);
  636. if (! Melder_equ (my d_phonemeSet.get(), my d_languageName.get())) {
  637. conststring32 phonemeCode = SpeechSynthesizer_getPhonemeCode (me);
  638. int index_phon_table_list = LookupPhonemeTable (Melder_peek32to8 (phonemeCode));
  639. if (index_phon_table_list > 0) {
  640. voice -> phoneme_tab_ix = index_phon_table_list;
  641. DoVoiceChange(voice);
  642. }
  643. }
  644. my d_events = Table_createWithColumnNames (0, U"time type type-t t-pos length a-pos sample id uniq");
  645. #ifdef _WIN32
  646. conststringW textW = Melder_peek32toW (text);
  647. espeak_ng_Synthesize (textW, wcslen (textW) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
  648. #else
  649. espeak_ng_Synthesize (text, str32len (text) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
  650. #endif
  651. espeak_ng_Terminate ();
  652. autoSound thee = buffer_to_Sound (my d_wav, my d_numberOfSamples, my d_internalSamplingFrequency);
  653. if (my d_samplingFrequency != my d_internalSamplingFrequency) {
  654. thee = Sound_resample (thee.get(), my d_samplingFrequency, 50);
  655. }
  656. my d_numberOfSamples = 0; // re-use the wav-buffer
  657. if (tg) {
  658. double xmin = Table_getNumericValue_Assert (my d_events.get(), 1, 1);
  659. if (xmin > thy xmin) {
  660. xmin = thy xmin;
  661. }
  662. double xmax = Table_getNumericValue_Assert (my d_events.get(), my d_events -> rows.size, 1);
  663. if (xmax < thy xmax) {
  664. xmax = thy xmax;
  665. }
  666. autoTextGrid tg1 = Table_to_TextGrid (my d_events.get(), text, xmin, xmax);
  667. *tg = TextGrid_extractPart (tg1.get(), thy xmin, thy xmax, 0);
  668. }
  669. if (events) {
  670. Table_setEventTypeString (my d_events.get());
  671. *events = my d_events.move();
  672. }
  673. my d_events.reset();
  674. return thee;
  675. } catch (MelderError) {
  676. espeak_Terminate ();
  677. Melder_throw (U"Text not played.");
  678. }
  679. }
  680. /* End of file SpeechSynthesizer.cpp */