// espeak-sg.cpp

  #include <math.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <string>
  #include <vector>
  #include <boost/lexical_cast.hpp>
  #include <espeak-ng/espeak_ng.h>
  #include <sndfile.h>
  #define PLAYBACK_MODE (ENOUTPUT_MODE_SYNCHRONOUS | ENOUTPUT_MODE_SPEAK_AUDIO)
  // One contiguous stretch of samples that share the same segment tag.
  // Every member has a default so a freshly declared segment never holds
  // indeterminate values.
  struct segment
  {
      char type = 0;   // segment tag ('S', 'U' or 'V' as passed to segment_event)
      int start = 0;   // index of the first sample of the segment
      int length = 0;  // number of samples in the segment
  };
  // A phoneme marker reported by the synthesizer, stamped with the sample
  // position at which it was received. The integer members have defaults so
  // a freshly declared event never holds indeterminate values.
  struct pho_event
  {
      std::string code;  // phoneme symbol text as handed to the output hook
      int type = 0;      // phoneme type value as handed to the output hook
      int start = 0;     // number of samples already collected when the event arrived
  };
  21. struct output_data
  22. {
  23. // for each run
  24. std::vector<short int> samples;
  25. std::vector<segment> segments;
  26. std::vector<pho_event> pho;
  27. char segment_type = 0;
  28. int segment_pos = 0;
  29. float length;
  30. };
  31. struct output_data* current;
  32. std::vector<output_data*> runs;
  33. void init()
  34. {
  35. espeak_ng_InitializePath(NULL);
  36. espeak_ng_ERROR_CONTEXT context = NULL;
  37. espeak_ng_Initialize(&context);
  38. espeak_ng_InitializeOutput(ENOUTPUT_MODE_SYNCHRONOUS, 0, NULL); //FIX https://notabug.org/isengaara/sekai/issues/4
  39. }
  40. void segment_event(char type)
  41. {
  42. if(current->segment_type!=type)
  43. {
  44. if(current->segment_type!=0)
  45. {
  46. segment s;
  47. s.type = current->segment_type;
  48. s.start = current->segment_pos;
  49. s.length = current->samples.size()-current->segment_pos;
  50. current->segments.push_back(s);
  51. }
  52. current->segment_type = type;
  53. current->segment_pos = current->samples.size();
  54. }
  55. }
  56. void outputPhoSymbol(char* pho_code,int pho_type)
  57. {
  58. pho_event p;
  59. p.code = pho_code;
  60. p.type = pho_type;
  61. p.start = current->samples.size();
  62. current->pho.push_back(p);
  63. }
  64. void outputSilence(short int sample)
  65. {
  66. segment_event('S');
  67. current->samples.push_back(sample);
  68. }
  69. void outputUnvoiced(short int sample)
  70. {
  71. segment_event('U');
  72. current->samples.push_back(sample);
  73. }
  74. void outputVoiced(short int sample)
  75. {
  76. segment_event('V');
  77. current->samples.push_back(sample);
  78. }
  79. void flush()
  80. {
  81. outputPhoSymbol((char*)"#",0);
  82. segment_event(0);
  83. }
  // Writes the characters of `s` followed by a terminating NUL byte to `f`.
  void write_zstring(FILE* f,std::string s)
  {
      const char* text = s.c_str();
      const size_t bytes_with_nul = s.length() + 1;
      fwrite(text, 1, bytes_with_nul, f);
  }
  // Writes the in-memory bytes of `n` (sizeof(int) of them, host byte order) to `f`.
  void write_int(FILE* f,int n)
  {
      const void* raw = &n;
      fwrite(raw, 1, sizeof n, f);
  }
  // Writes the in-memory bytes of `n` (sizeof(short) of them, host byte order) to `f`.
  void write_short(FILE* f,short n)
  {
      const void* raw = &n;
      fwrite(raw, 1, sizeof n, f);
  }
  // Writes the single byte `n` to `f`.
  void write_char(FILE* f,char n)
  {
      const void* raw = &n;
      fwrite(raw, 1, sizeof n, f);
  }
  // Writes the in-memory bytes of `n` (sizeof(float) of them, host representation) to `f`.
  void write_float(FILE* f,float n)
  {
      const void* raw = &n;
      fwrite(raw, 1, sizeof n, f);
  }
  104. void do_synth(int rate,int f0,char* lyric)
  105. {
  106. // synth with param
  107. espeak_SetParameter(espeakRATE, rate, 0);
  108. //synth
  109. espeak_ng_SetConstF0(f0);
  110. espeak_ng_Synthesize(lyric, 0, 0, POS_CHARACTER, 0, 0, NULL, NULL);
  111. flush();
  112. }
  113. void show_length(float fs)
  114. {
  115. float count = current->samples.size();
  116. float length = count/fs;
  117. current->length = length;
  118. }
  119. void find_best_one(float note_length)
  120. {
  121. bool found = false;
  122. for(size_t i=0; i<runs.size()-1; i++)
  123. {
  124. float delta0 = fabs(runs[i]->length - note_length);
  125. float delta1 = fabs(runs[i+1]->length - note_length);
  126. //printf("find best one %i %f %f",(int)i,delta0,delta1);
  127. if(!found && delta1 > delta0)
  128. {
  129. //printf(" found");
  130. current = runs[i];
  131. found = true;
  132. }
  133. //printf("\n");
  134. }
  135. }
  136. int main(int argc,char** argv)
  137. {
  138. if(argc<6)
  139. {
  140. printf("usage: espeak-sg voice f0 lyric rate filename [optargs..]\n");
  141. return 0;
  142. }
  143. char* voice = argv[1];
  144. int f0 = atoi(argv[2]);
  145. char* lyric = argv[3];
  146. int rate = atoi(argv[4]);
  147. char* filename = argv[5];
  148. bool have_note_length = false;
  149. float note_length = 0;
  150. for(int i=6;i<argc;i++)
  151. {
  152. char* optarg = argv[i];
  153. if(strlen(optarg)>4)
  154. {
  155. if(optarg[0]=='n' && optarg[1]=='l' && optarg[2]=='=')
  156. {
  157. note_length = boost::lexical_cast<float>(optarg+3);
  158. have_note_length = true;
  159. }
  160. }
  161. }
  162. init();
  163. espeak_ng_SetVoiceByName(voice);
  164. espeak_ng_OUTPUT_HOOKS hooks;
  165. hooks.outputPhoSymbol = outputPhoSymbol;
  166. hooks.outputSilence = outputSilence;
  167. hooks.outputUnvoiced = outputUnvoiced;
  168. hooks.outputVoiced = outputVoiced;
  169. espeak_ng_SetOutputHooks(&hooks);
  170. float samplerate = (float)espeak_ng_GetSampleRate();
  171. if(rate!=0)
  172. {
  173. current = new output_data;
  174. do_synth(rate,f0,lyric);
  175. }
  176. else
  177. {
  178. if(!have_note_length)
  179. {
  180. fprintf(stderr,"note length required\n");
  181. return 1;
  182. }
  183. #if 0
  184. SF_INFO info;
  185. info.samplerate = espeak_ng_GetSampleRate();
  186. info.channels = 1;
  187. info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
  188. info.sections = 0;
  189. info.frames = 0;
  190. info.seekable = 0;
  191. SNDFILE* sf = sf_open("/tmp/debug.wav",SFM_WRITE,&info);
  192. #endif
  193. for(int current_rate = 80; current_rate < 450; current_rate += 5)
  194. {
  195. //printf("set rate %i\n",current_rate);
  196. current = new output_data;
  197. do_synth(current_rate,f0,lyric);
  198. show_length(samplerate);
  199. runs.push_back(current);
  200. }
  201. find_best_one(note_length);
  202. #if 0
  203. sf_write_short(sf,current->samples.data(),current->samples.size());
  204. sf_close(sf);
  205. #endif
  206. }
  207. FILE* f = fopen(filename,"w");
  208. write_zstring(f,"espeak-sg");
  209. write_int(f,0); //version of the file format
  210. write_int(f,samplerate);
  211. write_int(f,f0);
  212. write_int(f,current->pho.size());
  213. for (auto i : current->pho)
  214. {
  215. write_zstring(f,i.code);
  216. write_int(f,i.type);
  217. write_int(f,i.start);
  218. }
  219. write_int(f,current->segments.size());
  220. for (auto i : current->segments)
  221. {
  222. write_char(f, i.type);
  223. write_int(f, i.start);
  224. write_int(f, i.length);
  225. }
  226. write_int(f,current->samples.size());
  227. fwrite(current->samples.data(),current->samples.size(),sizeof(short),f);
  228. fclose(f);
  229. return 0;
  230. }