// isengaara/sekai
							#include <espeak-ng/espeak_ng.h>
#define PLAYBACK_MODE (ENOUTPUT_MODE_SYNCHRONOUS | ENOUTPUT_MODE_SPEAK_AUDIO)

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fstream>
#include <map>
#include <string>
#include <vector>

#include <sndfile.h>

#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>

// One contiguous run of output samples that all share the same kind.
// Records are produced by segment_event() each time the kind of sample
// being emitted changes.
struct segment
{
    char type;   // kind tag of the run: 'S', 'U' or 'V' (set by the silence / unvoiced / voiced hooks)
    int start;   // index into output_data::samples of the first sample of the run
    int length;  // number of samples in the run
};

// A phoneme marker received through the outputPhoSymbol hook.
struct pho_event
{
    std::string code;  // phoneme symbol text; flush() appends a final "#" marker
    int type;          // numeric phoneme type from the hook; main() uses it as an index into its table of type names
    int start;         // number of samples already collected when the marker arrived
};


// Everything collected during one synthesis run (one call to do_synth()).
struct output_data
{
	// Raw 16-bit samples, appended one at a time by the output hooks.
	std::vector<short int> samples;
	// Completed runs of same-kind samples, appended by segment_event().
	std::vector<segment> segments;
	// Phoneme markers in arrival order, appended by outputPhoSymbol().
	std::vector<pho_event> pho;
	// Kind tag of the segment currently being collected; 0 means "none yet".
	char segment_type = 0;
	// Index into `samples` where the segment currently being collected began.
	int segment_pos = 0;
	// Duration of the run (sample count divided by the sample rate), filled in
	// by show_length(). Initialized so it is never read as an indeterminate
	// value on paths that do not call show_length().
	float length = 0;
};

struct output_data* current;
struct output_data* last = nullptr;

std::vector<output_data*> runs;

std::map<std::string,std::string> mbrolaMap;

// Initializes the espeak-ng library for synchronous synthesis without audio
// playback.
//
// The status codes returned by the library used to be ignored, so a failed
// initialization only surfaced later as a crash or as empty output. They are
// now checked: on failure a message is printed to stderr and the process
// exits with EXIT_FAILURE.
void init()
{
	espeak_ng_InitializePath(NULL);
	
	espeak_ng_ERROR_CONTEXT context = NULL;
	// Statuses are compared against 0, which is the value of ENS_OK in the
	// upstream espeak_ng.h header.
	const int init_status = espeak_ng_Initialize(&context);
	if(init_status != 0)
	{
		fprintf(stderr,"espeak-ng initialization failed (status 0x%x)\n",(unsigned int)init_status);
		exit(EXIT_FAILURE);
	}
	
	const int output_status = espeak_ng_InitializeOutput(ENOUTPUT_MODE_SYNCHRONOUS, 0, NULL); //FIX https://notabug.org/isengaara/sekai/issues/4
	if(output_status != 0)
	{
		fprintf(stderr,"espeak-ng output initialization failed (status 0x%x)\n",(unsigned int)output_status);
		exit(EXIT_FAILURE);
	}
}

void segment_event(char type)
{
    if(current->segment_type!=type)
    {
        if(current->segment_type!=0)
        {
            segment s;
            s.type = current->segment_type;
            s.start = current->segment_pos;
            s.length = current->samples.size()-current->segment_pos;
            current->segments.push_back(s);
        }
        current->segment_type = type;
        current->segment_pos = current->samples.size();
    }
}

void outputPhoSymbol(char* pho_code,int pho_type)
{
    pho_event p;
    p.code = pho_code;
    p.type = pho_type;
    p.start = current->samples.size();
    current->pho.push_back(p);
	
}

// Output hook: appends one sample to the active run, tagged as kind 'S'.
void outputSilence(short int sample)
{
    const char kind = 'S';
    // The segment bookkeeping must see the buffer size before the sample is added.
    segment_event(kind);
    auto& buffer = current->samples;
    buffer.push_back(sample);
}

// Output hook: appends one sample to the active run, tagged as kind 'U'.
void outputUnvoiced(short int sample)
{
    const char kind = 'U';
    // The segment bookkeeping must see the buffer size before the sample is added.
    segment_event(kind);
    auto& buffer = current->samples;
    buffer.push_back(sample);
}

// Output hook: appends one sample to the active run, tagged as kind 'V'.
void outputVoiced(short int sample)
{
    const char kind = 'V';
    // The segment bookkeeping must see the buffer size before the sample is added.
    segment_event(kind);
    auto& buffer = current->samples;
    buffer.push_back(sample);
}

// Finishes the active run: appends a terminating "#" phoneme marker of type 0
// and closes the segment that is still open.
void flush()
{
    char end_marker[] = "#";
    outputPhoSymbol(end_marker, 0);
    segment_event('\0');
}

// Writes the characters of `s` to `f`, followed by a single terminating NUL byte.
void write_zstring(FILE* f,std::string s)
{
    const char* text = s.c_str();
    const size_t byte_count = s.size() + 1;  // +1 keeps the NUL terminator
    fwrite(text, 1, byte_count, f);
}

// Writes the in-memory bytes of `n` (sizeof(int) of them) to `f`.
void write_int(FILE* f,int n)
{
    const int value = n;
    fwrite(&value, 1, sizeof value, f);
}

// Writes the in-memory bytes of `n` (sizeof(short) of them) to `f`.
void write_short(FILE* f,short n)
{
    const short value = n;
    fwrite(&value, 1, sizeof value, f);
}

// Writes the single byte `n` to `f`.
void write_char(FILE* f,char n)
{
    const char value = n;
    fwrite(&value, 1, sizeof value, f);
}

// Writes the in-memory bytes of `n` (sizeof(float) of them) to `f`.
void write_float(FILE* f,float n)
{
    const float value = n;
    fwrite(&value, 1, sizeof value, f);
}


// Synthesizes `lyric` at speaking rate `rate` with the constant pitch `f0`.
// The audio and phoneme markers arrive through the registered output hooks
// and land in the run pointed to by `current`; the run is closed with
// flush() afterwards.
void do_synth(int rate,int f0,char* lyric)
{
	// Argument names follow the upstream espeak-ng signatures; every value is
	// the same literal 0 the call sites always passed.
	const int relative = 0;
	const size_t text_size = 0;
	const unsigned int start_position = 0;
	const unsigned int end_position = 0;
	const unsigned int flags = 0;

	// Configure the synthesizer first ...
	espeak_SetParameter(espeakRATE, rate, relative);
	espeak_ng_SetConstF0(f0);

	// ... then run it and terminate the collected run.
	espeak_ng_Synthesize(lyric, text_size, start_position, POS_CHARACTER, end_position, flags, NULL, NULL);
	flush();
}

// Computes the duration of the active run and stores it in current->length.
// Despite the name, nothing is printed.
//
// fs: sample rate used to convert a sample position into a duration.
//
// The duration ends at the first phoneme marker whose code starts with '_'
// (the final marker is not inspected). If no such marker exists, the whole
// sample buffer counts.
void show_length(float fs)
{
	const auto& events = current->pho;

	// `i + 1 < size()` skips the last marker without the unsigned underflow
	// that `size() - 1` has on an empty list.
	for (std::size_t i = 0; i + 1 < events.size(); ++i)
	{
		const auto& event = events[i];
		if(!event.code.empty() && event.code[0]=='_')
		{
			current->length = event.start/fs;
			return;
		}
	}

	const float count = current->samples.size();
	current->length = count/fs;
}

// Points `current` at the run in `runs` whose length is closest to
// `note_length`, assuming the distance to `note_length` first shrinks and
// then grows along `runs`.
//
// The first run whose successor is further away from `note_length` is
// selected. If the distance never grows, the last run is the closest one and
// is selected explicitly instead of relying on what the caller left in
// `current`. With an empty `runs`, `current` is left untouched.
void find_best_one(float note_length)
{
	if(runs.empty())
		return;

	// `i + 1 < size()` avoids the unsigned underflow of `size() - 1`.
	for(std::size_t i = 0; i + 1 < runs.size(); ++i)
	{
		const float delta0 = fabs(runs[i]->length - note_length);
		const float delta1 = fabs(runs[i+1]->length - note_length);
		if(delta1 > delta0)
		{
			current = runs[i];
			return;
		}
	}

	current = runs.back();
}


// Loads a phoneme translation table into mbrolaMap.
//
// Each line of the file is split on spaces and tabs (runs of separators are
// merged); the first field becomes the key and the second field the value.
// Lines with fewer than two fields, such as blank lines, are skipped instead
// of being indexed out of bounds. If the file cannot be opened, a warning is
// printed to stderr and the table is left unchanged.
void load_mbrola_table(std::string fileName) {
  std::ifstream infile(fileName);
  if (!infile) {
    fprintf(stderr, "cannot open mbrola table: %s\n", fileName.c_str());
    return;
  }

  std::string line;
  while (std::getline(infile, line)) {
    std::vector<std::string> fields;
    boost::split(fields, line, boost::is_any_of("\t "),
                 boost::token_compress_on);
    if (fields.size() < 2) {
      continue;  // nothing to map on this line
    }
    mbrolaMap[fields[0]] = fields[1];
  }
}

int main(int argc,char** argv)
{
    if(argc<6) 
    {
        printf("usage: espeak-sg voice f0 lyric rate filename [optargs..]\n");
        return 0;
    }
    
    char* voice    =      argv[1];
    //printf("espeak::voice=%s\n",voice);
    int f0         = atoi(argv[2]);
    char* lyric    =      argv[3];
    int rate       = atoi(argv[4]);
    char* filename =      argv[5];
    
    bool have_note_length = false;
    float note_length = 0;
    
    bool mbrola = false;
    
    for(int i=6;i<argc;i++)
    {
		char* optarg = argv[i];
		if(strlen(optarg)>=4)
		{
			if(optarg[0]=='n' && optarg[1]=='l' && optarg[2]=='=')
			{
				note_length = boost::lexical_cast<float>(optarg+3);
				have_note_length = true;
			}
			if(optarg[0]=='m' && optarg[1]=='b' && optarg[2]=='=')
			{
				mbrola=true;
				load_mbrola_table(optarg+3);
			}
		}
	}
	
	
	init();
	espeak_ng_SetVoiceByName(voice);
	
	
	espeak_ng_OUTPUT_HOOKS hooks;
	hooks.outputPhoSymbol = outputPhoSymbol;
	hooks.outputSilence = outputSilence;
	hooks.outputUnvoiced = outputUnvoiced;
	hooks.outputVoiced = outputVoiced;
	espeak_ng_SetOutputHooks(&hooks);
	
	float samplerate = (float)espeak_ng_GetSampleRate();
	
	if(rate!=0)
	{
		current = new output_data;
		do_synth(rate,f0,lyric);
	}
	else
	{
		if(!have_note_length)
		{
			fprintf(stderr,"note length required\n");
			return 1;
		}
		#if 0
		SF_INFO info;
		info.samplerate = espeak_ng_GetSampleRate();
		info.channels = 1;
		info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
		info.sections = 0;
		info.frames = 0;
		info.seekable = 0;
		SNDFILE* sf = sf_open("/tmp/debug.wav",SFM_WRITE,&info);
		#endif
		
		for(int current_rate = 80; current_rate < 450; current_rate += 5)
		{
			//printf("set rate %i\n",current_rate);
			current = new output_data;
			do_synth(current_rate,f0,lyric);
			show_length(samplerate);
			runs.push_back(current);
			last = current;
		}
		
		find_best_one(note_length);
		
		#if 0
		sf_write_short(sf,current->samples.data(),current->samples.size());
		sf_close(sf);
		#endif
	}
	
	if(mbrola)
	{
		FILE* f = fopen(filename,"w");
		///if(have_note_length) fprintf(f,"#nl=%f\n",note_length); 
		for (uint i = 0; i<current->pho.size()-1; i++)
		{
			
			auto pho = current->pho[i].code;
			if(pho[0]=='_') break;
			if (mbrolaMap.count(pho))
			{
				pho = mbrolaMap[pho];
			}
			
			float t = (current->pho[i+1].start + current->pho[i].start) * 1.0 / samplerate;
			char* ototypes[]= {"PAUSE","STRESS","VOWEL","LIQUID","STOP","VSTOP","FRICATIVE","VFRICATIVE","NASAL","VIRTUAL","DELETED","INVALID"};
			fprintf(f,"%s %s %f",pho.c_str(),ototypes[current->pho[i].type],t);
			if(last)
			{
				float t_min = (last->pho[i+1].start + last->pho[i].start) * 1.0 / samplerate;
				fprintf(f," %f",t_min);
			}
			fprintf(f,"\n");
		}
		
		fclose(f);
		return 0;
	}
	
    FILE* f = fopen(filename,"w");
    write_zstring(f,"espeak-sg");
    write_int(f,0); //version of the file format
    write_int(f,samplerate);
    write_int(f,f0);
    
    
    write_int(f,current->pho.size());
    for (auto i : current->pho)
    {
        write_zstring(f,i.code);
        write_int(f,i.type);
        write_int(f,i.start);
    }
    
    write_int(f,current->segments.size());
    for (auto i : current->segments)
    {
        write_char(f, i.type);
        write_int(f, i.start);
        write_int(f,  i.length);
    }
    
    write_int(f,current->samples.size());
    fwrite(current->samples.data(),current->samples.size(),sizeof(short),f);
    
    fclose(f); 
    
    return 0;   
}