// espeak-sg.cpp
  1. #include <espeak-ng/espeak_ng.h>
  2. #define PLAYBACK_MODE (ENOUTPUT_MODE_SYNCHRONOUS | ENOUTPUT_MODE_SPEAK_AUDIO)
  3. #include <vector>
  4. #include <string>
  5. #include <sndfile.h>
  6. #include <math.h>
  7. #include <string.h>
  8. #include <fstream>
  9. #include <map>
  10. #include <boost/lexical_cast.hpp>
  11. #include <boost/algorithm/string/classification.hpp>
  12. #include <boost/algorithm/string/split.hpp>
  13. struct segment
  14. {
  15. char type;
  16. int start;
  17. int length;
  18. };
  19. struct pho_event
  20. {
  21. std::string code;
  22. int type;
  23. int start;
  24. };
  25. struct output_data
  26. {
  27. // for each run
  28. std::vector<short int> samples;
  29. std::vector<segment> segments;
  30. std::vector<pho_event> pho;
  31. char segment_type = 0;
  32. int segment_pos = 0;
  33. float length;
  34. };
  35. struct output_data* current;
  36. struct output_data* last = nullptr;
  37. std::vector<output_data*> runs;
  38. std::map<std::string,std::string> mbrolaMap;
  39. void init()
  40. {
  41. espeak_ng_InitializePath(NULL);
  42. espeak_ng_ERROR_CONTEXT context = NULL;
  43. espeak_ng_Initialize(&context);
  44. espeak_ng_InitializeOutput(ENOUTPUT_MODE_SYNCHRONOUS, 0, NULL); //FIX https://notabug.org/isengaara/sekai/issues/4
  45. }
  46. void segment_event(char type)
  47. {
  48. if(current->segment_type!=type)
  49. {
  50. if(current->segment_type!=0)
  51. {
  52. segment s;
  53. s.type = current->segment_type;
  54. s.start = current->segment_pos;
  55. s.length = current->samples.size()-current->segment_pos;
  56. current->segments.push_back(s);
  57. }
  58. current->segment_type = type;
  59. current->segment_pos = current->samples.size();
  60. }
  61. }
  62. void outputPhoSymbol(char* pho_code,int pho_type)
  63. {
  64. pho_event p;
  65. p.code = pho_code;
  66. p.type = pho_type;
  67. p.start = current->samples.size();
  68. current->pho.push_back(p);
  69. }
  70. void outputSilence(short int sample)
  71. {
  72. segment_event('S');
  73. current->samples.push_back(sample);
  74. }
  75. void outputUnvoiced(short int sample)
  76. {
  77. segment_event('U');
  78. current->samples.push_back(sample);
  79. }
  80. void outputVoiced(short int sample)
  81. {
  82. segment_event('V');
  83. current->samples.push_back(sample);
  84. }
  85. void flush()
  86. {
  87. outputPhoSymbol((char*)"#",0);
  88. segment_event(0);
  89. }
  90. void write_zstring(FILE* f,std::string s)
  91. {
  92. fwrite(s.c_str(),1,s.length()+1,f);
  93. }
  94. void write_int(FILE* f,int n)
  95. {
  96. fwrite(&n,1,sizeof(int),f);
  97. }
  98. void write_short(FILE* f,short n)
  99. {
  100. fwrite(&n,1,sizeof(short),f);
  101. }
  102. void write_char(FILE* f,char n)
  103. {
  104. fwrite(&n,1,sizeof(char),f);
  105. }
  106. void write_float(FILE* f,float n)
  107. {
  108. fwrite(&n,1,sizeof(float),f);
  109. }
  110. void do_synth(int rate,int f0,char* lyric)
  111. {
  112. // synth with param
  113. espeak_SetParameter(espeakRATE, rate, 0);
  114. //synth
  115. espeak_ng_SetConstF0(f0);
  116. espeak_ng_Synthesize(lyric, 0, 0, POS_CHARACTER, 0, 0, NULL, NULL);
  117. flush();
  118. }
  119. void show_length(float fs)
  120. {
  121. float count = current->samples.size();
  122. for (uint i = 0; i<current->pho.size()-1; i++)
  123. {
  124. auto pho = current->pho[i].code;
  125. auto pos = current->pho[i].start;
  126. //printf("%s %i\n",pho.c_str(),pos);
  127. if(pho[0]=='_')
  128. {
  129. float length = pos/fs;
  130. current->length = length;
  131. return;
  132. }
  133. }
  134. float length = count/fs;
  135. current->length = length;
  136. }
  137. void find_best_one(float note_length)
  138. {
  139. bool found = false;
  140. for(size_t i=0; i<runs.size()-1; i++)
  141. {
  142. float delta0 = fabs(runs[i]->length - note_length);
  143. float delta1 = fabs(runs[i+1]->length - note_length);
  144. //printf("find best one %i %f %f",(int)i,delta0,delta1);
  145. if(!found && delta1 > delta0)
  146. {
  147. //printf(" found");
  148. current = runs[i];
  149. found = true;
  150. }
  151. //printf("\n");
  152. }
  153. }
  154. void load_mbrola_table(std::string fileName) {
  155. std::ifstream infile(fileName);
  156. std::string line;
  157. while (std::getline(infile, line)) {
  158. std::vector<std::string> spl;
  159. boost::split(spl, line, boost::is_any_of("\t "),
  160. boost::token_compress_on);
  161. mbrolaMap[spl[0]] = spl[1];
  162. }
  163. }
  164. int main(int argc,char** argv)
  165. {
  166. if(argc<6)
  167. {
  168. printf("usage: espeak-sg voice f0 lyric rate filename [optargs..]\n");
  169. return 0;
  170. }
  171. char* voice = argv[1];
  172. //printf("espeak::voice=%s\n",voice);
  173. int f0 = atoi(argv[2]);
  174. char* lyric = argv[3];
  175. int rate = atoi(argv[4]);
  176. char* filename = argv[5];
  177. bool have_note_length = false;
  178. float note_length = 0;
  179. bool mbrola = false;
  180. for(int i=6;i<argc;i++)
  181. {
  182. char* optarg = argv[i];
  183. if(strlen(optarg)>=4)
  184. {
  185. if(optarg[0]=='n' && optarg[1]=='l' && optarg[2]=='=')
  186. {
  187. note_length = boost::lexical_cast<float>(optarg+3);
  188. have_note_length = true;
  189. }
  190. if(optarg[0]=='m' && optarg[1]=='b' && optarg[2]=='=')
  191. {
  192. mbrola=true;
  193. load_mbrola_table(optarg+3);
  194. }
  195. }
  196. }
  197. init();
  198. espeak_ng_SetVoiceByName(voice);
  199. espeak_ng_OUTPUT_HOOKS hooks;
  200. hooks.outputPhoSymbol = outputPhoSymbol;
  201. hooks.outputSilence = outputSilence;
  202. hooks.outputUnvoiced = outputUnvoiced;
  203. hooks.outputVoiced = outputVoiced;
  204. espeak_ng_SetOutputHooks(&hooks);
  205. float samplerate = (float)espeak_ng_GetSampleRate();
  206. if(rate!=0)
  207. {
  208. current = new output_data;
  209. do_synth(rate,f0,lyric);
  210. }
  211. else
  212. {
  213. if(!have_note_length)
  214. {
  215. fprintf(stderr,"note length required\n");
  216. return 1;
  217. }
  218. #if 0
  219. SF_INFO info;
  220. info.samplerate = espeak_ng_GetSampleRate();
  221. info.channels = 1;
  222. info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
  223. info.sections = 0;
  224. info.frames = 0;
  225. info.seekable = 0;
  226. SNDFILE* sf = sf_open("/tmp/debug.wav",SFM_WRITE,&info);
  227. #endif
  228. for(int current_rate = 80; current_rate < 450; current_rate += 5)
  229. {
  230. //printf("set rate %i\n",current_rate);
  231. current = new output_data;
  232. do_synth(current_rate,f0,lyric);
  233. show_length(samplerate);
  234. runs.push_back(current);
  235. last = current;
  236. }
  237. find_best_one(note_length);
  238. #if 0
  239. sf_write_short(sf,current->samples.data(),current->samples.size());
  240. sf_close(sf);
  241. #endif
  242. }
  243. if(mbrola)
  244. {
  245. FILE* f = fopen(filename,"w");
  246. ///if(have_note_length) fprintf(f,"#nl=%f\n",note_length);
  247. for (uint i = 0; i<current->pho.size()-1; i++)
  248. {
  249. auto pho = current->pho[i].code;
  250. if(pho[0]=='_') break;
  251. if (mbrolaMap.count(pho))
  252. {
  253. pho = mbrolaMap[pho];
  254. }
  255. float t = (current->pho[i+1].start + current->pho[i].start) * 1.0 / samplerate;
  256. char* ototypes[]= {"PAUSE","STRESS","VOWEL","LIQUID","STOP","VSTOP","FRICATIVE","VFRICATIVE","NASAL","VIRTUAL","DELETED","INVALID"};
  257. fprintf(f,"%s %s %f",pho.c_str(),ototypes[current->pho[i].type],t);
  258. if(last)
  259. {
  260. float t_min = (last->pho[i+1].start + last->pho[i].start) * 1.0 / samplerate;
  261. fprintf(f," %f",t_min);
  262. }
  263. fprintf(f,"\n");
  264. }
  265. fclose(f);
  266. return 0;
  267. }
  268. FILE* f = fopen(filename,"w");
  269. write_zstring(f,"espeak-sg");
  270. write_int(f,0); //version of the file format
  271. write_int(f,samplerate);
  272. write_int(f,f0);
  273. write_int(f,current->pho.size());
  274. for (auto i : current->pho)
  275. {
  276. write_zstring(f,i.code);
  277. write_int(f,i.type);
  278. write_int(f,i.start);
  279. }
  280. write_int(f,current->segments.size());
  281. for (auto i : current->segments)
  282. {
  283. write_char(f, i.type);
  284. write_int(f, i.start);
  285. write_int(f, i.length);
  286. }
  287. write_int(f,current->samples.size());
  288. fwrite(current->samples.data(),current->samples.size(),sizeof(short),f);
  289. fclose(f);
  290. return 0;
  291. }