necklace
/
sekai
Fork de isengaara/sekai


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
							#include "sekai/UTAUSynth.h"
#include "sekai/Track.h"

#include <string.h>
#include <fstream>
#include <iostream>
#include <sstream>

#include <json/json.h>
#include <math.h>
#include <sndfile.h>
#include <boost/filesystem.hpp>

// KEY(name) defines a file-local constant `key_<name>` holding the string
// "<name>". The three constants below are the member names that readConfig()
// looks up in the JSON configuration file.
#define KEY(s) static const std::string key_##s = #s;
KEY(samplerate)
KEY(fft_size)
KEY(frame_period)

// Voice definition for one UTAU unit whose synthesis data is held in memory.
//
// UTAUSynth::load() creates the instances and fills in the public fields;
// UTAUSynth::getImpulseResponse() reads the sample buffer directly instead of
// going through the virtual interface, which is why most of the virtual
// methods below are stubs.
//
// The sample buffer is a plain pointer that this class never frees.
class VoiceDefUTAU : public VoiceDef {
 public:
  // Samples read from the unit's ".ogg" file (nullptr until load() sets it).
  float *_input_data = nullptr;
  // Number of floats stored in _input_data.
  int _input_data_length = 0;
  // Frame count of the unit's ".wav" file divided by the sample rate.
  float _input_length_oto = 0.0f;

  // Returns the length computed from the ".wav" file.
  virtual float getLength() { return _input_length_oto; }
  // Stub: leaves both output arguments untouched.
  virtual void getImpulseResponse(float pos, float *impulseResponse,
                                  int *impulseResponseLength) {}
  // Stub: always reports 0.
  virtual int getSamplerate() { return 0; }
  // Stub: always returns an empty line.
  virtual std::string getPhoLine(int index) { return ""; }
};

// Reads optional synthesis settings from the JSON file `fileName`.
//
// Each output argument is overwritten only when the file contains a member of
// the matching name ("samplerate", "fft_size", "frame_period") with a
// suitable type; otherwise the value supplied by the caller is kept, so the
// caller should store its defaults before calling.
//
// A file that cannot be opened is treated as "no configuration": a warning is
// printed and all outputs keep their values. A file that opens but is not
// valid JSON is still handed to jsoncpp's stream operator, which reports the
// failure itself (presumably by throwing -- confirm for the jsoncpp version in
// use).
void readConfig(const std::string &fileName, int *samplerate, int *fft_size,
                float *frame_period) {
  std::ifstream file(fileName);
  if (!file.is_open()) {
    std::cerr << "UTAUSynth: cannot open config '" << fileName
              << "', keeping default settings\n";
    return;
  }

  Json::Value root;
  file >> root;

  // Member lookups are only meaningful on a JSON object; anything else
  // (array, string, number, ...) carries no settings.
  if (!root.isObject()) {
    std::cerr << "UTAUSynth: config '" << fileName
              << "' is not a JSON object, keeping default settings\n";
    return;
  }

  if (root.isMember(key_samplerate) && root[key_samplerate].isInt()) {
    *samplerate = root[key_samplerate].asInt();
  }
  if (root.isMember(key_fft_size) && root[key_fft_size].isInt()) {
    *fft_size = root[key_fft_size].asInt();
  }
  if (root.isMember(key_frame_period) && root[key_frame_period].isNumeric()) {
    *frame_period = root[key_frame_period].asFloat();
  }
}

// Creates a synthesizer for the voicebank directory `utauPath`.
//
// `pitch` supplies the F0 curve used by addOnePulse(); `buffer_size` is
// forwarded to the VoiceSampler base class. Settings start from built-in
// defaults and are then overridden by "<utauPath>/oto.json" where that file
// provides them.
UTAUSynth::UTAUSynth(std::string utauPath, PitchModel *pitch, int buffer_size)
    : VoiceSampler(buffer_size) {
  const int defaultFftSize = 2048;
  const float defaultFramePeriod = 5.0f;

  _utauPath = utauPath;
  _impulseResponse = new float[IMPULSE_RESPONSE_MAX];
  _pitch = pitch;

  // load() treats a sample rate of 0 as "not known yet" and adopts the rate
  // of the first file it reads, so start from that state explicitly instead
  // of relying on whatever value the member happens to hold.
  _samplerate = 0;
  _fft_size = defaultFftSize;
  _frame_period = defaultFramePeriod;
  _synthext = ".ogg";
  readConfig(_utauPath + "/oto.json", &_samplerate, &_fft_size, &_frame_period);

  // The config file is external input: reject values that would later cause
  // out-of-bounds writes or divisions by zero.
  if (_samplerate < 0) {
    std::cerr << "UTAUSynth: ignoring invalid samplerate " << _samplerate
              << '\n';
    _samplerate = 0;
  }
  // getImpulseResponse() writes _fft_size floats into _impulseResponse.
  if (_fft_size <= 0 || _fft_size > IMPULSE_RESPONSE_MAX) {
    std::cerr << "UTAUSynth: ignoring invalid fft_size " << _fft_size << '\n';
    _fft_size = defaultFftSize;
  }
  // getImpulseResponse() divides by _frame_period.
  if (!(_frame_period > 0)) {
    std::cerr << "UTAUSynth: ignoring invalid frame_period " << _frame_period
              << '\n';
    _frame_period = defaultFramePeriod;
  }
}

void UTAUSynth::addUnit(const std::string &lyric, int count, float *a,
                        float *b) {
  std::string path;

  path = _utauPath + "/" + lyric;

  if (_voicemap[path] == nullptr) {
    //_voicemap[path] = load(path);
    load(path);  // this may fail -> return false
  }

  PhoEvent *e = new PhoEvent;
  e->points = count;
  e->voice = _voicemap[path];

  VoiceDefUTAU *voice = (VoiceDefUTAU *)e->voice;
  float length = voice->_input_length_oto;

  for (int i = 0; i < count; i++) {
    e->x[i] = a[i];
    if (b[i] < 0)
      e->y[i] = length - b[i];
    else
      e->y[i] = b[i];
  }
  _phoEvents.addEvent(e);
}

// Synthesizes the next pitch period and overlap-adds it to the output.
//
// Returns true when a pulse (or a stretch of silence before the current
// event starts) was added, and false when there is nothing to synthesize:
// no sample rate is known, no event is selected, or the current time lies at
// or beyond the end of the current event.
bool UTAUSynth::addOnePulse() {
  // The sample rate is a divisor below and scales every period; without a
  // positive value no meaningful pulse can be produced.
  if (_samplerate <= 0) return false;

  // Divide in floating point explicitly so that the result keeps its
  // fractional part whatever type inputPositionSamples() returns.
  const float currentTime =
      inputPositionSamples() / static_cast<float>(_samplerate);
  _phoEvents.selectNext(currentTime);

  PhoEvent *pho0 = _phoEvents.current();
  PhoEvent *pho1 = _phoEvents.next();

  if (pho0 == nullptr) return false;

  // Fall back to 50 Hz whenever the pitch model gives no usable F0; a zero or
  // negative value would produce an infinite or negative period.
  float output_f0 = _pitch->getF0atTime(currentTime);
  if (!(output_f0 > 0)) output_f0 = 50;

  if (currentTime < pho0->start()) {
    // rest: output silence, advancing in steps of samplerate / 500 samples
    output_f0 = 500;
    const float period = _samplerate * 1.0f / output_f0;
    float silence = 0.0f;  // zero-length input; initialized to be safe
    ola(&silence, 0, period);
    return true;
  }

  if (currentTime >= pho0->start() && currentTime < pho0->end()) {
    float interp = 0;
    int impulseResponseLength = 0;

    getImpulseResponse(currentTime, pho0, _impulseResponse,
                       &impulseResponseLength,
                       0);  // TODO get impulse response from mapped index

    // While the next event overlaps the current one, cross-fade towards it:
    // the weight grows from 0 at pho1->start() to 1 at pho0->end(). The
    // denominator is positive here because
    // pho1->start() <= currentTime < pho0->end().
    if (pho1 && currentTime >= pho1->start()) {
      interp = (currentTime - pho1->start()) / (pho0->end() - pho1->start());
    }

    if (interp > 0) {
      getImpulseResponse(currentTime, pho1, _impulseResponse,
                         &impulseResponseLength,
                         interp);  // needs interp as input
    }

    const float period = _samplerate * 1.0f / output_f0;

    VoiceSampler::hanningWindow(_impulseResponse, impulseResponseLength);

    ola(_impulseResponse, impulseResponseLength, period);
    return true;
  }
  return false;
}

void UTAUSynth::load(std::string fileName) {
#if 0
  std::string f0 =
      boost::filesystem::change_extension(fileName, ".f0").string();
  std::string pmk =
      boost::filesystem::change_extension(fileName, ".pmk").string();

  rec->f0Track.readFromFile(f0);
  rec->pmkTrack.readFromFile(pmk);
#endif

  std::string fileName1 = fileName + ".wav";
  std::string fileName2 = fileName + ".ogg";

  SF_INFO info = {0};
  SNDFILE *sf = sf_open(fileName2.c_str(), SFM_READ, &info);
  if (sf == nullptr) {
    return;  // false
  }

  float *input_data = new float[info.frames];
  int input_data_length = info.frames;
  assert(info.channels == 1);
  if (_samplerate == 0)
    _samplerate = info.samplerate;
  else
    assert(_samplerate == info.samplerate);  // return false if mismatch

  sf_read_float(sf, input_data, info.frames);
  sf_close(sf);

  sf = sf_open(fileName1.c_str(), SFM_READ, &info);
  assert(info.channels == 1);
  // store length rec->input_data_length_orig = info.frames;
  sf_close(sf);

  VoiceDefUTAU *voice = new VoiceDefUTAU();
  voice->_input_data = input_data;
  voice->_input_data_length = input_data_length;
  voice->_input_length_oto = info.frames * 1.0 / _samplerate;
  _voicemap[fileName] = voice;
}

void UTAUSynth::getImpulseResponse(float currentTime, PhoEvent *event,
                                   float *impulseResponse,
                                   int *impulseResponseLength, float morph) {
  VoiceDefUTAU *voice = (VoiceDefUTAU *)event->voice;

  float localTime =
      interp_linear(event->x, event->y, event->points, currentTime);

  *impulseResponseLength = _fft_size;
  int frame_index = localTime * 1000 / _frame_period;
  int frame_offset = frame_index * _fft_size;
  int posL = frame_offset;
  int posR = frame_offset + _fft_size;
  float pmk_interp = 0;

  if (posL < 0) posL = 0;
  if (posR < 0) posR = 0;

  for (int i = 0; i < *impulseResponseLength; i++) {
    float l = 0;
    float r = 0;
    float x = 0;

    if (i + posL < voice->_input_data_length) l = voice->_input_data[i + posL];
    if (i + posR < voice->_input_data_length) r = voice->_input_data[i + posR];
    x = r * (1 - pmk_interp) + l * pmk_interp;

    if (morph) {
      r = x;
      l = impulseResponse[i];
      x = r * morph + l * (1.0f - morph);
    }

    impulseResponse[i] = x;
  }
}

// TODO: remove later
void UTAUSynth::addPitchPointsForNote(float notepos, float length, float f0) {
  _pitch->addNote(notepos, length, f0);
}
// Intentionally does nothing: rests add no points to the pitch model.
void UTAUSynth::addPitchPointsForRest(float notepos, float length) {
  (void)notepos;
  (void)length;
}
void UTAUSynth::fix() { _pitch->fix(); }
// Placeholder: reports a length of 0 for every unit, ignoring `fileName`.
float UTAUSynth::getLengthForUnit(const std::string &fileName) {
  (void)fileName;
  return 0;
}