123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225 |
#include "sekai/UTAUSynth.h"
#include "sekai/Track.h"

#include <math.h>
#include <string.h>

#include <exception>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>

#include <boost/filesystem.hpp>
#include <json/json.h>
#include <sndfile.h>
// Declares a file-local string constant named key_<s> whose value is the
// literal spelling of <s>. The macro stays defined so further keys can still
// be declared with it.
#define KEY(s) static const std::string key_##s = #s;

// Member names looked up in the JSON configuration by readConfig().
static const std::string key_samplerate = "samplerate";
static const std::string key_fft_size = "fft_size";
static const std::string key_frame_period = "frame_period";
- class VoiceDefUTAU : public VoiceDef {
- public:
- float *_input_data;
- int _input_data_length;
- float _input_length_oto;
- virtual float getLength() {return _input_length_oto;}
- virtual void getImpulseResponse(float pos, float *impulseResponse, int *impulseResponseLength) {}
- virtual int getSamplerate(){return 0;}
- virtual std::string getPhoLine(int index){return "";}
- };
- void readConfig(const std::string &fileName, int *samplerate, int *fft_size,
- float *frame_period) {
- Json::Value root;
- std::ifstream file(fileName);
- file >> root;
- if (root.isMember(key_samplerate) && root[key_samplerate].isInt()) {
- *samplerate = root[key_samplerate].asInt();
- }
- if (root.isMember(key_fft_size) && root[key_fft_size].isInt()) {
- *fft_size = root[key_fft_size].asInt();
- }
- if (root.isMember(key_frame_period) && root[key_frame_period].isNumeric()) {
- *frame_period = root[key_frame_period].asFloat();
- }
- }
- UTAUSynth::UTAUSynth(std::string utauPath, PitchModel *pitch, int buffer_size)
- : VoiceSampler(buffer_size) {
- _utauPath = utauPath;
- _impulseResponse = new float[IMPULSE_RESPONSE_MAX];
- _pitch = pitch;
- _fft_size = 2048;
- _frame_period = 5.0;
- _synthext = ".ogg";
- readConfig(_utauPath + "/oto.json", &_samplerate, &_fft_size, &_frame_period);
- }
- void UTAUSynth::addUnit(const std::string &lyric, int count, float *a,
- float *b) {
- std::string path;
- path = _utauPath + "/" + lyric;
- if (_voicemap[path] == nullptr) {
- //_voicemap[path] = load(path);
- load(path); // this may fail -> return false
- }
- PhoEvent *e = new PhoEvent;
- e->points = count;
- e->voice = _voicemap[path];
- VoiceDefUTAU *voice = (VoiceDefUTAU *)e->voice;
- float length = voice->_input_length_oto;
- for (int i = 0; i < count; i++) {
- e->x[i] = a[i];
- if (b[i] < 0)
- e->y[i] = length - b[i];
- else
- e->y[i] = b[i];
- }
- _phoEvents.addEvent(e);
- }
- bool UTAUSynth::addOnePulse() {
- float currentTime = inputPositionSamples() / _samplerate;
- _phoEvents.selectNext(currentTime);
- PhoEvent *pho0 = _phoEvents.current();
- PhoEvent *pho1 = _phoEvents.next();
- if (pho0 == nullptr) return false;
- float output_f0 = _pitch->getF0atTime(currentTime);
- if (output_f0 == 0) output_f0 = 50;
- if (currentTime < pho0->start()) {
- // rest: output silence
- output_f0 = 500;
- float period = _samplerate * 1.0f / output_f0;
- float dummy;
- ola(&dummy, 0, period);
- return true;
- }
- if (currentTime >= pho0->start() && currentTime < pho0->end()) {
- float interp = 0;
- int impulseResponseLength = 0;
- getImpulseResponse(currentTime, pho0, _impulseResponse,
- &impulseResponseLength,
- 0); // TODO get impulse response from mapped index
- if (pho1 && currentTime >= pho1->start()) {
- interp = (currentTime - pho1->start()) / (pho0->end() - pho1->start());
- }
- if (interp > 0) {
- getImpulseResponse(currentTime, pho1, _impulseResponse,
- &impulseResponseLength,
- interp); // needs interp as input
- }
- // int tmp = static_cast<int>(_samplerate * 1.0f / output_f0);
- // float period = tmp;
- float period = _samplerate * 1.0f / output_f0;
- VoiceSampler::hanningWindow(_impulseResponse, impulseResponseLength);
- ola(_impulseResponse, impulseResponseLength, period);
- return true;
- }
- return false;
- }
- void UTAUSynth::load(std::string fileName) {
- #if 0
- std::string f0 =
- boost::filesystem::change_extension(fileName, ".f0").string();
- std::string pmk =
- boost::filesystem::change_extension(fileName, ".pmk").string();
- rec->f0Track.readFromFile(f0);
- rec->pmkTrack.readFromFile(pmk);
- #endif
- std::string fileName1 = fileName + ".wav";
- std::string fileName2 = fileName + ".ogg";
- SF_INFO info = {0};
- SNDFILE *sf = sf_open(fileName2.c_str(), SFM_READ, &info);
- if (sf == nullptr) {
- return; // false
- }
- float *input_data = new float[info.frames];
- int input_data_length = info.frames;
- assert(info.channels == 1);
- if (_samplerate == 0)
- _samplerate = info.samplerate;
- else
- assert(_samplerate == info.samplerate); // return false if mismatch
- sf_read_float(sf, input_data, info.frames);
- sf_close(sf);
- sf = sf_open(fileName1.c_str(), SFM_READ, &info);
- assert(info.channels == 1);
- // store length rec->input_data_length_orig = info.frames;
- sf_close(sf);
- VoiceDefUTAU *voice = new VoiceDefUTAU();
- voice->_input_data = input_data;
- voice->_input_data_length = input_data_length;
- voice->_input_length_oto = info.frames * 1.0 / _samplerate;
- _voicemap[fileName] = voice;
- }
- void UTAUSynth::getImpulseResponse(float currentTime, PhoEvent *event,
- float *impulseResponse,
- int *impulseResponseLength, float morph) {
- VoiceDefUTAU *voice = (VoiceDefUTAU *)event->voice;
- float localTime =
- interp_linear(event->x, event->y, event->points, currentTime);
- *impulseResponseLength = _fft_size;
- int frame_index = localTime * 1000 / _frame_period;
- int frame_offset = frame_index * _fft_size;
- int posL = frame_offset;
- int posR = frame_offset + _fft_size;
- float pmk_interp = 0;
- if (posL < 0) posL = 0;
- if (posR < 0) posR = 0;
- for (int i = 0; i < *impulseResponseLength; i++) {
- float l = 0;
- float r = 0;
- float x = 0;
- if (i + posL < voice->_input_data_length) l = voice->_input_data[i + posL];
- if (i + posR < voice->_input_data_length) r = voice->_input_data[i + posR];
- x = r * (1 - pmk_interp) + l * pmk_interp;
- if (morph) {
- r = x;
- l = impulseResponse[i];
- x = r * morph + l * (1.0f - morph);
- }
- impulseResponse[i] = x;
- }
- }
- // TODO: remove later
- void UTAUSynth::addPitchPointsForNote(float notepos, float length, float f0) {
- _pitch->addNote(notepos, length, f0);
- }
- void UTAUSynth::addPitchPointsForRest(float notepos, float length) {}
- void UTAUSynth::fix() { _pitch->fix(); }
- float UTAUSynth::getLengthForUnit(const std::string &fileName) { return 0; }
|