Ejemplo n.º 1
0
/* HTS_Engine_generate_state_sequence: generate state sequence (1st synthesis step)
 *
 * Builds engine->sss from the loaded label and model set using the settings stored in
 * engine->condition.  When additional_half_tone is non-zero, element 0 of the mean of
 * stream 1 is shifted by additional_half_tone * HALF_TONE for every state and clamped
 * to [MIN_LF0, MAX_LF0].
 *
 * Returns TRUE on success.  If the state stream set cannot be created the engine is
 * refreshed and FALSE is returned.
 */
static HTS_Boolean HTS_Engine_generate_state_sequence(HTS_Engine * engine)
{
   size_t i, total_state;
   double f;

   if (HTS_SStreamSet_create(&engine->sss, &engine->ms, &engine->label, engine->condition.phoneme_alignment_flag, engine->condition.speed, engine->condition.duration_iw, engine->condition.parameter_iw, engine->condition.gv_iw) != TRUE) {
      HTS_Engine_refresh(engine);
      return FALSE;
   }

   /* nothing to adjust when no pitch shift was requested */
   if (engine->condition.additional_half_tone == 0.0)
      return TRUE;

   /* the number of states does not change while the means are rewritten */
   total_state = HTS_Engine_get_total_state(engine);
   for (i = 0; i < total_state; i++) {
      f = HTS_Engine_get_state_mean(engine, 1, i, 0);
      f += engine->condition.additional_half_tone * HALF_TONE;
      if (f < MIN_LF0)
         f = MIN_LF0;
      else if (f > MAX_LF0)
         f = MAX_LF0;
      HTS_Engine_set_state_mean(engine, 1, i, 0, f);
   }
   return TRUE;
}
Ejemplo n.º 2
0
void TextToSpeech::synthesis(char *txt, FILE * wavfp)
{
	char buff[MAXBUFLEN];

	text2mecab(buff, txt);
	Mecab_analysis(&open_jtalk_.mecab, buff);
	mecab2njd(&open_jtalk_.njd, Mecab_get_feature(&open_jtalk_.mecab), Mecab_get_size(&open_jtalk_.mecab));
	njd_set_pronunciation(&open_jtalk_.njd);
	njd_set_digit(&open_jtalk_.njd);
	njd_set_accent_phrase(&open_jtalk_.njd);
	njd_set_accent_type(&open_jtalk_.njd);
	njd_set_unvoiced_vowel(&open_jtalk_.njd);
	njd_set_long_vowel(&open_jtalk_.njd);
	njd2jpcommon(&open_jtalk_.jpcommon, &open_jtalk_.njd);
	JPCommon_make_label(&open_jtalk_.jpcommon);
	if (JPCommon_get_label_size(&open_jtalk_.jpcommon) > 2) {
		HTS_Engine_load_label_from_string_list(
			&open_jtalk_.engine,
			JPCommon_get_label_feature(&open_jtalk_.jpcommon),
			JPCommon_get_label_size(&open_jtalk_.jpcommon)
		);
		HTS_Engine_create_sstream(&open_jtalk_.engine);
		HTS_Engine_create_pstream(&open_jtalk_.engine);
		HTS_Engine_create_gstream(&open_jtalk_.engine);
		if (wavfp != NULL)
			HTS_Engine_save_riff(&open_jtalk_.engine, wavfp);
		HTS_Engine_refresh(&open_jtalk_.engine);
	}
	JPCommon_refresh(&open_jtalk_.jpcommon);
	NJD_refresh(&open_jtalk_.njd);
	Mecab_refresh(&open_jtalk_.mecab);
}
Ejemplo n.º 3
0
/* HTS_Engine_synthesize: synthesize speech
 *
 * Runs the three synthesis steps in order: state sequence, parameter sequence,
 * sample sequence.  The first step that does not return TRUE stops the chain;
 * the engine is then refreshed and FALSE is returned.  Returns TRUE when all
 * three steps succeed.
 */
static HTS_Boolean HTS_Engine_synthesize(HTS_Engine * engine)
{
   /* short-circuit evaluation: later steps are not attempted after a failure */
   if (HTS_Engine_generate_state_sequence(engine) != TRUE
       || HTS_Engine_generate_parameter_sequence(engine) != TRUE
       || HTS_Engine_generate_sample_sequence(engine) != TRUE) {
      HTS_Engine_refresh(engine);
      return FALSE;
   }

   return TRUE;
}
Ejemplo n.º 4
0
void TextToSpeech::synthesis(const char *txt)
{
	char buff[MAXBUFLEN];

	text2mecab(buff, txt);
	Mecab_analysis(&open_jtalk_.mecab, buff);
	mecab2njd(&open_jtalk_.njd, Mecab_get_feature(&open_jtalk_.mecab), Mecab_get_size(&open_jtalk_.mecab));
	njd_set_pronunciation(&open_jtalk_.njd);
	njd_set_digit(&open_jtalk_.njd);
	njd_set_accent_phrase(&open_jtalk_.njd);
	njd_set_accent_type(&open_jtalk_.njd);
	njd_set_unvoiced_vowel(&open_jtalk_.njd);
	njd_set_long_vowel(&open_jtalk_.njd);
	njd2jpcommon(&open_jtalk_.jpcommon, &open_jtalk_.njd);
	JPCommon_make_label(&open_jtalk_.jpcommon);
	if (JPCommon_get_label_size(&open_jtalk_.jpcommon) > 2) {
		unsigned int pcm_len;
		HTS_Engine_load_label_from_string_list(
			&open_jtalk_.engine,
			JPCommon_get_label_feature(&open_jtalk_.jpcommon),
			JPCommon_get_label_size(&open_jtalk_.jpcommon)
		);
		HTS_Engine_create_sstream(&open_jtalk_.engine);
		HTS_Engine_create_pstream(&open_jtalk_.engine);
		HTS_Engine_create_gstream(&open_jtalk_.engine);
		pcm_len = HTS_Engine_get_generated_speech_size(&open_jtalk_.engine);
		pcm_ = new short[pcm_len];
		HTS_Engine_get_generated_speech(&open_jtalk_.engine, pcm_);
		play_write(play_h_, pcm_, pcm_len * sizeof(short));
		HTS_Engine_refresh(&open_jtalk_.engine);
	}
	JPCommon_refresh(&open_jtalk_.jpcommon);
	NJD_refresh(&open_jtalk_.njd);
	Mecab_refresh(&open_jtalk_.mecab);
}
Ejemplo n.º 5
0
/* Flite_HTS_Engine_synthesize: synthesize speech
 *
 * f   - engine wrapper; f->engine performs the synthesis
 * txt - text to synthesize; NULL is rejected
 * wav - when not NULL, path of a file that receives the result via
 *       HTS_Engine_save_riff()
 *
 * Returns TRUE only when synthesis succeeded and, if wav was given, the output
 * file could be opened.  Returns FALSE when txt is NULL, the voice cannot be
 * registered, text analysis yields no segment, memory runs out, synthesis
 * fails, or the output file cannot be opened.  The voice, the utterance and
 * the label buffers are released on every path.
 */
HTS_Boolean Flite_HTS_Engine_synthesize(Flite_HTS_Engine * f, const char *txt, const char *wav)
{
   int i;
   FILE *fp;
   cst_voice *v = NULL;
   cst_utterance *u = NULL;
   cst_item *s = NULL;
   char **label_data = NULL;
   int label_size = 0;
   HTS_Boolean result = FALSE;

   if (txt == NULL)
      return FALSE;

   /* text analysis part */
   v = REGISTER_VOX(NULL);
   if (v == NULL)
      return FALSE;
   u = flite_synth_text(txt, v);
   if (u == NULL) {
      UNREGISTER_VOX(v);
      return FALSE;
   }
   for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s))
      label_size++;
   if (label_size <= 0)
      goto cleanup;

   /* one MAXBUFLEN-sized label string per segment */
   label_data = (char **) calloc(label_size, sizeof(char *));
   if (label_data == NULL)
      goto cleanup;
   for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) {
      label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char));
      if (label_data[i] == NULL)
         goto cleanup;
      Flite_HTS_Engine_create_label(f, s, label_data[i]);
   }

   /* speech synthesis part */
   if (HTS_Engine_synthesize_from_strings(&f->engine, label_data, label_size) == TRUE)
      result = TRUE;
   if (wav != NULL) {
      fp = fopen(wav, "wb");
      if (fp == NULL) {
         /* the requested output could not be produced */
         result = FALSE;
      } else {
         HTS_Engine_save_riff(&f->engine, fp);
         fclose(fp);
      }
   }
   HTS_Engine_refresh(&f->engine);

 cleanup:
   if (label_data != NULL) {
      /* entries never allocated are still NULL from calloc; free(NULL) is a no-op */
      for (i = 0; i < label_size; i++)
         free(label_data[i]);
      free(label_data);
   }

   delete_utterance(u);
   UNREGISTER_VOX(v);

   return result;
}
Ejemplo n.º 6
0
/*
 * OpenJTalk_synthesis_towav: synthesize text and write it to a RIFF file.
 *
 * openjtalk   - address of the OpenJTalk handle to use
 * text        - input text handed to text2mecab()
 * wavfilename - path of the output file, opened with mode "wb"
 *
 * Returns 0 when the output file cannot be opened (a message is written to
 * the handle's errorout buffer) and 1 otherwise.  Speech is only generated
 * and saved when the label has more than 2 entries.
 */
int OpenJTalk_synthesis_towav(OpenJTalk** openjtalk,const char* text, const char* wavfilename)
{
   char mecab_input[MAXBUFLEN];
   OpenJTalk *oj;
   FILE *out = fopen(wavfilename,"wb");

   oj = *openjtalk;
   if (out == NULL) {
       sprintf(oj->errorout,"can not open %s.",wavfilename);
       return 0;
   }

   /* text analysis: text -> MeCab -> NJD -> JPCommon label */
   text2mecab(mecab_input, (char*)text);
   Mecab_analysis(oj->mecab, mecab_input);
   mecab2njd(&oj->njd, Mecab_get_feature(oj->mecab), Mecab_get_size(oj->mecab));
   njd_set_pronunciation(&oj->njd);
   njd_set_digit(&oj->njd);
   njd_set_accent_phrase(&oj->njd);
   njd_set_accent_type(&oj->njd);
   njd_set_unvoiced_vowel(&oj->njd);
   njd_set_long_vowel(&oj->njd);
   njd2jpcommon(&oj->jpcommon, &oj->njd);
   JPCommon_make_label(&oj->jpcommon);

   /* speech synthesis: state stream -> parameter stream -> sample stream */
   if (JPCommon_get_label_size(&oj->jpcommon) > 2) {
      HTS_Engine_load_label_from_string_list(&oj->engine,
                                             JPCommon_get_label_feature(&oj->jpcommon),
                                             JPCommon_get_label_size(&oj->jpcommon));
      HTS_Engine_create_sstream(&oj->engine);
      HTS_Engine_create_pstream(&oj->engine);
      HTS_Engine_create_gstream(&oj->engine);
      HTS_Engine_save_riff(&oj->engine, out);
      HTS_Engine_refresh(&oj->engine);
   }

   /* release the per-utterance analysis data */
   JPCommon_refresh(&oj->jpcommon);
   NJD_refresh(&oj->njd);
   Mecab_refresh(oj->mecab);

   fclose(out);
   return 1;
}
Ejemplo n.º 7
0
/* HTS_Engine_synthesize_from_strings: synthesize speech from strings
 *
 * engine    - engine whose sss/pss/gss stream sets are rebuilt
 * lines     - label lines
 * num_lines - number of entries in lines
 *
 * The engine is refreshed first.  Each of the three creation steps (state,
 * parameter, sample stream set) refreshes the engine and returns FALSE when it
 * fails; TRUE is returned when all three succeed.
 */
HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
{
   size_t state;
   double mean;

   HTS_Engine_refresh(engine);

   /* step 1: state sequence */
   if (HTS_SStreamSet_create(&engine->sss, &engine->ms, lines, num_lines, engine->condition.speed) != TRUE) {
      HTS_Engine_refresh(engine);
      return FALSE;
   }

   /* optional pitch shift: move element 0 of the stream-1 mean, clamped to [MIN_LF0, MAX_LF0] */
   if (engine->condition.additional_half_tone != 0.0) {
      state = 0;
      while (state < HTS_SStreamSet_get_total_state(&engine->sss)) {
         mean = HTS_SStreamSet_get_mean(&engine->sss, 1, state, 0);
         mean += engine->condition.additional_half_tone * HALF_TONE;
         mean = (mean < MIN_LF0) ? MIN_LF0 : ((mean > MAX_LF0) ? MAX_LF0 : mean);
         HTS_SStreamSet_set_mean(&engine->sss, 1, state, 0, mean);
         state++;
      }
   }

   /* step 2: parameter sequence */
   if (HTS_PStreamSet_create(&engine->pss, &engine->sss, engine->condition.msd_threshold) != TRUE) {
      HTS_Engine_refresh(engine);
      return FALSE;
   }

   /* the state sequence is no longer needed once the parameters exist */
   HTS_SStreamSet_clear(&engine->sss);

   /* step 3: sound sample sequence */
   if (HTS_GStreamSet_create(&engine->gss, &engine->pss,
                             engine->condition.sampling_frequency,
                             engine->condition.fperiod,
                             engine->condition.alpha,
                             engine->condition.beta) != TRUE) {
      HTS_Engine_refresh(engine);
      return FALSE;
   }

   return TRUE;
}
Ejemplo n.º 8
0
/*
 * synthesize: analyse txt, synthesize it and hand the samples to play_write().
 *
 * app - application state (analysis objects, engine, playback handle, log file)
 * txt - input text handed to text2mecab()
 *
 * Returns 0 when speech was synthesized and passed to play_write(), -1
 * otherwise (label with 2 entries or fewer, synthesis failure, or failure to
 * allocate the sample buffer).  When app->logfp is set, the analysis result,
 * the label and the engine information are written to it whenever the label
 * has more than 2 entries.
 *
 * NOTE(review): app->pcm is overwritten with a new buffer and not freed here;
 * confirm who releases it.
 */
static int synthesize(struct app *app, char *txt)
{
	char buff[MAXBUFLEN];
	int label_size;
	int r = -1;

	text2mecab(buff, txt);
	Mecab_analysis(&app->mecab, buff);
	mecab2njd(&app->njd, Mecab_get_feature(&app->mecab),
		  Mecab_get_size(&app->mecab));
	njd_set_pronunciation(&app->njd);
	njd_set_digit(&app->njd);
	njd_set_accent_phrase(&app->njd);
	njd_set_accent_type(&app->njd);
	njd_set_unvoiced_vowel(&app->njd);
	njd_set_long_vowel(&app->njd);
	njd2jpcommon(&app->jpcommon, &app->njd);
	JPCommon_make_label(&app->jpcommon);
	label_size = JPCommon_get_label_size(&app->jpcommon);
	if (label_size > 2) {
		if (HTS_Engine_synthesize_from_strings(
				&app->engine,
				JPCommon_get_label_feature(&app->jpcommon),
				label_size) == TRUE) {
			unsigned int pcm_len;

			pcm_len = HTS_Engine_get_generated_speech_size(
					&app->engine);
			app->pcm = malloc(pcm_len * sizeof(short));
			/*
			 * malloc(0) may legitimately return NULL, so only a
			 * NULL result for a non-empty buffer is a failure.
			 */
			if (app->pcm != NULL || pcm_len == 0) {
				r = 0;	/* success */
				HTS_Engine_get_generated_speech(&app->engine,
								app->pcm);
				play_write(app->play_h, app->pcm,
					   pcm_len * sizeof(short));
			}
		}

		if (app->logfp) {
			fprintf(app->logfp, "[Text analysis result]\n");
			NJD_fprint(&app->njd, app->logfp);
			fprintf(app->logfp, "\n[Output label]\n");
			HTS_Engine_save_label(&app->engine, app->logfp);
			fprintf(app->logfp, "\n");
			HTS_Engine_save_information(&app->engine, app->logfp);
		}
		HTS_Engine_refresh(&app->engine);
	}
	JPCommon_refresh(&app->jpcommon);
	NJD_refresh(&app->njd);
	Mecab_refresh(&app->mecab);

	return r;
}
/* Flite_HTS_Engine_synthesis: speech synthesis
 *
 * f     - engine wrapper; f->engine performs the synthesis
 * txt   - text to synthesize; NULL is ignored
 * wavfp - when not NULL, receives the result via HTS_Engine_save_riff()
 *
 * Returns without synthesizing when txt is NULL, the voice cannot be
 * registered, text analysis fails or yields no segment, or memory runs out.
 * The voice, the utterance and the label buffers are released on every path.
 */
void Flite_HTS_Engine_synthesis(Flite_HTS_Engine * f, char *txt, FILE * wavfp)
{
    int i;
    cst_voice *v = NULL;
    cst_utterance *u = NULL;
    cst_item *s = NULL;
    char **label_data = NULL;
    int label_size = 0;

    if (txt == NULL)
        return;

    /* text analysis part */
    v = REGISTER_VOX(NULL);
    if (v == NULL)
        return;
    u = flite_synth_text(txt, v);
    if (u == NULL) {
        /* the voice was registered above and must not leak */
        UNREGISTER_VOX(v);
        return;
    }
    for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s))
        label_size++;
    if (label_size <= 0)
        goto cleanup;

    /* one MAXBUFLEN-sized label string per segment */
    label_data = (char **) calloc(label_size, sizeof(char *));
    if (label_data == NULL)
        goto cleanup;
    for (i = 0, s = relation_head(utt_relation(u, "Segment")); s;
            s = item_next(s), i++) {
        label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char));
        if (label_data[i] == NULL)
            goto cleanup;
        Flite_HTS_Engine_create_label(f, s, label_data[i]);
    }

    /* speech synthesis part */
    HTS_Engine_load_label_from_string_list(&f->engine, label_data, label_size);
    HTS_Engine_create_sstream(&f->engine);
    HTS_Engine_create_pstream(&f->engine);
    HTS_Engine_create_gstream(&f->engine);
    if (wavfp != NULL)
        HTS_Engine_save_riff(&f->engine, wavfp);

    HTS_Engine_refresh(&f->engine);

cleanup:
    if (label_data != NULL) {
        /* entries never allocated are still NULL from calloc; free(NULL) is a no-op */
        for (i = 0; i < label_size; i++)
            free(label_data[i]);
        free(label_data);
    }

    delete_utterance(u);
    UNREGISTER_VOX(v);
}
Ejemplo n.º 10
0
/*
 * Open_JTalk_synthesis: analyse txt, synthesize it and write the requested outputs.
 *
 * open_jtalk - analysis objects and engine
 * txt        - input text handed to text2mecab()
 * wavfp      - when not NULL, receives the speech via HTS_Engine_save_riff()
 * logfp      - when not NULL, receives the analysis result, the label and the
 *              engine information
 *
 * Returns 1 when HTS_Engine_synthesize_from_strings() returned TRUE, 0
 * otherwise (including when the label has 2 entries or fewer).  The wav and
 * log outputs are written whenever the label has more than 2 entries,
 * whatever the synthesis result.
 */
static int Open_JTalk_synthesis(Open_JTalk * open_jtalk, const char *txt, FILE * wavfp,
                                FILE * logfp)
{
   char mecab_input[MAXBUFLEN];
   int label_count;
   int ok = 0;

   /* text analysis: text -> MeCab -> NJD -> JPCommon label */
   text2mecab(mecab_input, txt);
   Mecab_analysis(&open_jtalk->mecab, mecab_input);
   mecab2njd(&open_jtalk->njd,
             Mecab_get_feature(&open_jtalk->mecab), Mecab_get_size(&open_jtalk->mecab));
   njd_set_pronunciation(&open_jtalk->njd);
   njd_set_digit(&open_jtalk->njd);
   njd_set_accent_phrase(&open_jtalk->njd);
   njd_set_accent_type(&open_jtalk->njd);
   njd_set_unvoiced_vowel(&open_jtalk->njd);
   njd_set_long_vowel(&open_jtalk->njd);
   njd2jpcommon(&open_jtalk->jpcommon, &open_jtalk->njd);
   JPCommon_make_label(&open_jtalk->jpcommon);

   label_count = JPCommon_get_label_size(&open_jtalk->jpcommon);
   if (label_count > 2) {
      /* speech synthesis */
      ok = (HTS_Engine_synthesize_from_strings(&open_jtalk->engine,
                                               JPCommon_get_label_feature(&open_jtalk->jpcommon),
                                               label_count) == TRUE) ? 1 : 0;

      if (wavfp != NULL) {
         HTS_Engine_save_riff(&open_jtalk->engine, wavfp);
      }

      if (logfp != NULL) {
         fprintf(logfp, "[Text analysis result]\n");
         NJD_fprint(&open_jtalk->njd, logfp);
         fprintf(logfp, "\n[Output label]\n");
         HTS_Engine_save_label(&open_jtalk->engine, logfp);
         fprintf(logfp, "\n");
         HTS_Engine_save_information(&open_jtalk->engine, logfp);
      }

      HTS_Engine_refresh(&open_jtalk->engine);
   }

   /* release the per-utterance analysis data */
   JPCommon_refresh(&open_jtalk->jpcommon);
   NJD_refresh(&open_jtalk->njd);
   Mecab_refresh(&open_jtalk->mecab);

   return ok;
}
Ejemplo n.º 11
0
/*
 * main: command-line synthesizer.
 *
 * Options read from argv:
 *   -lex <file>   lexicon opened by Linguistic
 *   -lexd <file>  lexicon opened by Disambiguator
 *   -m <file>     HTS voice file (required)
 *   -f <file>     input text file (required)
 *   -o <file>     output audio file (required)
 *   -s <n>        passed to samplerate() to set fr, fp and alpha
 *   -r <x>        speed
 *   -ht <x>       half-tone shift
 *   -gvw1 <x>, -gvw2 <x>  GV weights for streams 0 and 1
 *   -dur <file>   also write duration labels to this file
 *   -debug, -utt  forwarded to do_all() as print_label / print_utt
 *   -raw          do not write the audio header
 *
 * The text is split into utterances, each utterance is synthesized and its
 * samples are appended to the output file.  Unless -raw is given, the header
 * is written once before the loop and written again with the final data size
 * after the loop.
 *
 * Returns 0 on success.  Missing required options print the usage text; a
 * voice that cannot be loaded, an output file that cannot be opened or a
 * failed synthesis terminate with exit status 1.
 */
int main(int argc, char* argv[]) {
    char **fn_voices;
    /* NULL marks "option not given" so that missing options can be detected */
    char* in_fname = NULL;
    char* output_fname = NULL;
    FILE * outfp;
    char* dur_fname = NULL;
    FILE * durfp = NULL;
    bool print_label = false;
    bool print_utt = false;
    bool write_raw = false;
    bool write_durlabel = false;

    CFSAString LexFileName, LexDFileName;
    HTS_Engine engine;
    double speed = 1.1;
    size_t fr = 48000;
    size_t fp = 240;
    float alpha = 0.55;
    float beta = 0.0;
    float ht = 2.0;
    float th = 0.5;
    float gvw1 = 1.0;
    float gvw2 = 1.2;

    FSCInit();
    /* only slot 0 is used; calloc leaves it NULL until -m is seen */
    fn_voices = (char **) calloc((size_t) argc + 1, sizeof (char *));
    if (fn_voices == NULL) {
        perror("calloc");
        return 1;
    }
    fn_voices[0] = NULL;

    if (argc < 11) {
        fprintf(stderr, "Viga: liiga vähe parameetreid\n\n");
        PrintUsage();
    }

    for (int i = 0; i < argc; i++) {
        if (CFSAString("-lex") == argv[i]) {
            if (i + 1 < argc) {
                LexFileName = argv[++i];
            } else {
                return PrintUsage();
            }
        }
        if (CFSAString("-lexd") == argv[i]) {
            if (i + 1 < argc) {
                LexDFileName = argv[++i];
            } else {
                return PrintUsage();
            }
        }
        if (CFSAString("-m") == argv[i]) {
            if (i + 1 < argc) {
                fn_voices[0] = argv[i + 1];
            } else {
                fprintf(stderr, "Viga: puudub *.htsvoice fail\n");
                PrintUsage();
                exit(0);
            }
        }
        if (CFSAString("-o") == argv[i]) {
            if (i + 1 < argc) {
                output_fname = argv[i + 1];
                cfileexists(output_fname);
            } else {
                fprintf(stderr, "Viga: puudb väljundfaili nimi\n");
                PrintUsage();
                exit(0);
            }
        }
        if (CFSAString("-f") == argv[i]) {
            if (i + 1 < argc) {
                in_fname = argv[i + 1];
            } else {
                fprintf(stderr, "Viga: puudb sisendfaili nimi\n");
                PrintUsage();
                exit(0);
            }
        }
        if (CFSAString("-s") == argv[i]) {
            if (i + 1 < argc) {
                samplerate(fr, fp, alpha, atoi(argv[i + 1]));
            }
        }
        if (CFSAString("-r") == argv[i]) {
            if (i + 1 < argc) {
                speed = atof(argv[i + 1]);
            }
        }
        if (CFSAString("-ht") == argv[i]) {
            if (i + 1 < argc) {
                ht = atof(argv[i + 1]);
            }
        }
        if (CFSAString("-gvw1") == argv[i]) {
            if (i + 1 < argc) {
                gvw1 = atof(argv[i + 1]);
            }
        }
        if (CFSAString("-gvw2") == argv[i]) {
            if (i + 1 < argc) {
                gvw2 = atof(argv[i + 1]);
            }
        }
        if (CFSAString("-debug") == argv[i]) {
            print_label = true;
        }
        if (CFSAString("-utt") == argv[i]) {
            print_utt = true;
        }
        if (CFSAString("-raw") == argv[i]) {
            write_raw = true;
        }
        if (CFSAString("-dur") == argv[i]) {
            if (i + 1 < argc) {
                dur_fname = argv[i + 1];
                cfileexists(dur_fname);
                write_durlabel = true;
            } else {
                fprintf(stderr, "Viga: puudb kestustefaili nimi\n");
                PrintUsage();
                exit(0);
            }
        }
    }

    /* the voice, input and output names are used unconditionally below */
    if (fn_voices[0] == NULL) {
        fprintf(stderr, "Viga: puudub *.htsvoice fail\n");
        free(fn_voices);
        return PrintUsage();
    }
    if (in_fname == NULL) {
        fprintf(stderr, "Viga: puudb sisendfaili nimi\n");
        free(fn_voices);
        return PrintUsage();
    }
    if (output_fname == NULL) {
        fprintf(stderr, "Viga: puudb väljundfaili nimi\n");
        free(fn_voices);
        return PrintUsage();
    }

    Linguistic.Open(LexFileName);
    Disambiguator.Open(LexDFileName);

    CFSWString text;
    ReadUTF8Text(text, in_fname);
    HTS_Engine_initialize(&engine);

    if (HTS_Engine_load(&engine, fn_voices, 1) != TRUE) {
        /* report the voice file name rather than its address */
        fprintf(stderr, "Viga: puudub *.htsvoice. %s\n", fn_voices[0]);
        free(fn_voices);
        HTS_Engine_clear(&engine);
        exit(1);
    }
    free(fn_voices);

    HTS_Engine_set_sampling_frequency(&engine, (size_t) fr);
    HTS_Engine_set_phoneme_alignment_flag(&engine, FALSE);
    HTS_Engine_set_fperiod(&engine, (size_t) fp);
    HTS_Engine_set_alpha(&engine, alpha);
    HTS_Engine_set_beta(&engine, beta);
    HTS_Engine_set_speed(&engine, speed);
    HTS_Engine_add_half_tone(&engine, ht);
    HTS_Engine_set_msd_threshold(&engine, 1, th);
    HTS_Engine_set_gv_weight(&engine, 0, gvw1);
    HTS_Engine_set_gv_weight(&engine, 1, gvw2);

    text = DealWithText(text);
    CFSArray<CFSWString> res = do_utterances(text);

    INTPTR data_size = 0;
    outfp = fopen(output_fname, "wb");
    if (outfp == NULL) {
        perror(output_fname);
        HTS_Engine_clear(&engine);
        exit(1);
    }
    if (write_durlabel) {
        durfp = fopen(dur_fname, "w");
        if (durfp == NULL) {
            perror(dur_fname);
            fclose(outfp);
            HTS_Engine_clear(&engine);
            exit(1);
        }
    }
    /* placeholder header; rewritten with the real size after the loop */
    if (!write_raw) HTS_Engine_write_header(&engine, outfp, 1);
    for (INTPTR i = 0; i < res.GetSize(); i++) {

        CFSArray<CFSWString> label = do_all(res[i], print_label, print_utt);

        std::vector<std::string> v;
        v = to_vector(label);

        std::vector<char*> vc;
        fill_char_vector(v, vc);

        size_t n_lines = vc.size();

        if (HTS_Engine_synthesize_from_strings(&engine, &vc[0], n_lines) != TRUE) {
            fprintf(stderr, "Viga: süntees ebaonnestus.\n");
            clean_char_vector(vc);
            if (durfp != NULL) fclose(durfp);
            fclose(outfp);
            HTS_Engine_clear(&engine);
            exit(1);
        }

        clean_char_vector(vc);
        data_size += HTS_Engine_engine_speech_size(&engine);
        if (write_durlabel) HTS_Engine_save_durlabel(&engine, durfp);
        HTS_Engine_save_generated_speech(&engine, outfp);

        HTS_Engine_refresh(&engine);

    } //synth loop

    if (!write_raw) HTS_Engine_write_header(&engine, outfp, data_size);
    if (write_durlabel) fclose(durfp);
    fclose(outfp);

    HTS_Engine_clear(&engine);
    Linguistic.Close();

    FSCTerminate();
    return 0;

}
Ejemplo n.º 12
0
/* HTS_Engine_synthesize_from_strings: synthesize speech from strings
 *
 * engine    - engine to synthesize with
 * lines     - label lines
 * num_lines - number of entries in lines
 *
 * Refreshes the engine, loads the label from lines using the sampling
 * frequency and frame period stored in engine->condition, then returns the
 * result of HTS_Engine_synthesize().
 */
HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
{
   HTS_Engine_refresh(engine);
   HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
   return HTS_Engine_synthesize(engine);
}
Ejemplo n.º 13
0
/* HTS_Engine_synthesize_from_fn: synthesize speech from file name
 *
 * engine - engine to synthesize with
 * fn     - name of the label file
 *
 * Refreshes the engine, loads the label from fn using the sampling frequency
 * and frame period stored in engine->condition, then returns the result of
 * HTS_Engine_synthesize().
 */
HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn)
{
   HTS_Engine_refresh(engine);
   HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
   return HTS_Engine_synthesize(engine);
}
Ejemplo n.º 14
0
/*
 * main: command-line front end for the synthesis engine.
 *
 * Parses the options, loads the duration, spectrum, log F0 and optional
 * low-pass filter models (stream[0] = spectrum, stream[1] = lf0,
 * stream[2] = low-pass filter) together with their global variance (GV)
 * models, applies the global parameters, then reads the non-option argument
 * as a text file containing one label file name per line.  Each label file is
 * synthesized in turn and, when -or was given, its samples are appended to
 * the raw output file.
 *
 * Returns 0 after all label files have been processed.
 */
int main(int argc, char **argv)
{
   int i;
   double f;
   char *labfn = NULL;
   size_t labfn_size = 0;
   char *guifn = NULL;
   FILE *guifp = NULL;
   FILE *rawfp = NULL;

   /* number of speakers for interpolation */
   int num_interp = 0;
   double *rate_interp = NULL;

   /* file names of models */
   char **fn_ms_dur;
   char **fn_ms_mgc;
   char **fn_ms_lf0;
   char **fn_ms_lpf;
   /* number of each models for interpolation */
   int num_ms_dur = 0, num_ms_mgc = 0, num_ms_lf0 = 0, num_ms_lpf = 0;

   /* file names of trees */
   char **fn_ts_dur;
   char **fn_ts_mgc;
   char **fn_ts_lf0;
   char **fn_ts_lpf;
   /* number of each trees for interpolation */
   int num_ts_dur = 0, num_ts_mgc = 0, num_ts_lf0 = 0, num_ts_lpf = 0;

   /* file names of windows */
   char **fn_ws_mgc;
   char **fn_ws_lf0;
   char **fn_ws_lpf;
   int num_ws_mgc = 0, num_ws_lf0 = 0, num_ws_lpf = 0;

   /* file names of global variance */
   char **fn_ms_gvm = NULL;
   char **fn_ms_gvl = NULL;
   char **fn_ms_gvf = NULL;
   int num_ms_gvm = 0, num_ms_gvl = 0, num_ms_gvf = 0;

   /* file names of global variance trees */
   char **fn_ts_gvm = NULL;
   char **fn_ts_gvl = NULL;
   char **fn_ts_gvf = NULL;
   int num_ts_gvm = 0, num_ts_gvl = 0, num_ts_gvf = 0;

   /* file name of global variance switch */
   char *fn_gv_switch = NULL;

   /* global parameter */
   int sampling_rate = 16000;
   int fperiod = 80;
   double alpha = 0.42;
   int stage = 0;               /* Gamma=-1/stage: if stage=0 then Gamma=0 */
   double beta = 0.0;
   int audio_buff_size = 1600;
   double uv_threshold = 0.5;
   double gv_weight_mgc = 1.0;
   double gv_weight_lf0 = 1.0;
   double gv_weight_lpf = 1.0;

   double half_tone = 0.0;
   HTS_Boolean phoneme_alignment = FALSE;
   double speech_speed = 1.0;
   HTS_Boolean use_log_gain = FALSE;

   /* engine */
   HTS_Engine engine;

   /* parse command line */
   if (argc == 1)
      Usage();

   /* delta window handler for mel-cepstrum */
   fn_ws_mgc = (char **) calloc(argc, sizeof(char *));
   /* delta window handler for log f0 */
   fn_ws_lf0 = (char **) calloc(argc, sizeof(char *));
   /* delta window handler for low-pass filter */
   fn_ws_lpf = (char **) calloc(argc, sizeof(char *));

   /* prepare for interpolation */
   num_interp = GetNumInterp(argc, argv);
   rate_interp = (double *) calloc(num_interp, sizeof(double));
   for (i = 0; i < num_interp; i++)
      rate_interp[i] = 1.0;

   fn_ms_dur = (char **) calloc(num_interp, sizeof(char *));
   fn_ms_mgc = (char **) calloc(num_interp, sizeof(char *));
   fn_ms_lf0 = (char **) calloc(num_interp, sizeof(char *));
   fn_ms_lpf = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_dur = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_mgc = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_lf0 = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_lpf = (char **) calloc(num_interp, sizeof(char *));
   fn_ms_gvm = (char **) calloc(num_interp, sizeof(char *));
   fn_ms_gvl = (char **) calloc(num_interp, sizeof(char *));
   fn_ms_gvf = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_gvm = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_gvl = (char **) calloc(num_interp, sizeof(char *));
   fn_ts_gvf = (char **) calloc(num_interp, sizeof(char *));

   /* read command */
   while (--argc) {
      if (**++argv == '-') {
         switch (*(*argv + 1)) {
         case 'v':
            switch (*(*argv + 2)) {
            case 'p':
               phoneme_alignment = TRUE;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-v%c'.\n", *(*argv + 2));
            }
            break;
         case 't':
            switch (*(*argv + 2)) {
            case 'd':
               fn_ts_dur[num_ts_dur++] = *++argv;
               break;
            case 'm':
               fn_ts_mgc[num_ts_mgc++] = *++argv;
               break;
            case 'f':
            case 'p':
               fn_ts_lf0[num_ts_lf0++] = *++argv;
               break;
            case 'l':
               fn_ts_lpf[num_ts_lpf++] = *++argv;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-t%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'm':
            switch (*(*argv + 2)) {
            case 'd':
               fn_ms_dur[num_ms_dur++] = *++argv;
               break;
            case 'm':
               fn_ms_mgc[num_ms_mgc++] = *++argv;
               break;
            case 'f':
            case 'p':
               fn_ms_lf0[num_ms_lf0++] = *++argv;
               break;
            case 'l':
               fn_ms_lpf[num_ms_lpf++] = *++argv;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-m%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'd':
            switch (*(*argv + 2)) {
            case 'm':
               fn_ws_mgc[num_ws_mgc++] = *++argv;
               break;
            case 'f':
            case 'p':
               fn_ws_lf0[num_ws_lf0++] = *++argv;
               break;
            case 'l':
               fn_ws_lpf[num_ws_lpf++] = *++argv;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-d%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'o':
            switch (*(*argv + 2)) {
            case 'r':
               /* opened in append mode: every label file adds to the same raw output */
               rawfp = Getfp(*++argv, "ab");
               break;
            default:
               Error(1, "festcat-hts_engine: Invalid option '-o%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'h':
            Usage();
            break;
         case 's':
            sampling_rate = atoi(*++argv);
            --argc;
            break;
         case 'p':
            fperiod = atoi(*++argv);
            --argc;
            break;
         case 'a':
            alpha = atof(*++argv);
            --argc;
            break;
         case 'g':
            stage = atoi(*++argv);
            --argc;
            break;
         case 'l':
            use_log_gain = TRUE;
            break;
         case 'b':
            beta = atof(*++argv);
            --argc;
            break;
         case 'r':
            speech_speed = atof(*++argv);
            --argc;
            break;
         case 'f':
            switch (*(*argv + 2)) {
            case 'm':
               /* half-tone shift is limited to [-24, 24] */
               f = atof(*++argv);
               if (f < -24.0)
                  f = -24.0;
               if (f > 24.0)
                  f = 24.0;
               half_tone = f;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-f%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'u':
            uv_threshold = atof(*++argv);
            --argc;
            break;
         case 'i':
            /* skip the count argument, then read one rate per interpolated model */
            ++argv;
            argc--;
            for (i = 0; i < num_interp; i++) {
               rate_interp[i] = atof(*++argv);
               argc--;
            }
            break;
         case 'e':
            switch (*(*argv + 2)) {
            case 'm':
               fn_ts_gvm[num_ts_gvm++] = *++argv;
               break;
            case 'f':
            case 'p':
               fn_ts_gvl[num_ts_gvl++] = *++argv;
               break;
            case 'l':
               fn_ts_gvf[num_ts_gvf++] = *++argv;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-e%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'c':
            switch (*(*argv + 2)) {
            case 'm':
               fn_ms_gvm[num_ms_gvm++] = *++argv;
               break;
            case 'f':
            case 'p':
               fn_ms_gvl[num_ms_gvl++] = *++argv;
               break;
            case 'l':
               fn_ms_gvf[num_ms_gvf++] = *++argv;
               break;
            default:
               Error(1, "hts_engine: Invalid option '-c%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'j':
            switch (*(*argv + 2)) {
            case 'm':
               gv_weight_mgc = atof(*++argv);
               break;
            case 'f':
            case 'p':
               gv_weight_lf0 = atof(*++argv);
               break;
            case 'l':
               gv_weight_lpf = atof(*++argv);
               break;
            default:
               Error(1, "hts_engine: Invalid option '-j%c'.\n", *(*argv + 2));
            }
            --argc;
            break;
         case 'k':
            fn_gv_switch = *++argv;
            --argc;
            break;
         case 'z':
            audio_buff_size = atoi(*++argv);
            --argc;
            break;
         default:
            Error(1, "hts_engine: Invalid option '-%c'.\n", *(*argv + 1));
         }
      } else {
         /* the non-option argument names the file listing the label files */
         guifn = *argv;
      }
   }
   /* number of models,trees check */
   if (num_interp != num_ts_dur || num_interp != num_ts_mgc ||
       num_interp != num_ts_lf0 || num_interp != num_ms_dur ||
       num_interp != num_ms_mgc || num_interp != num_ms_lf0) {
      Error(1, "hts_engine: specify %d models(trees) for each parameter.\n",
            num_interp);
   }
   if (num_ms_lpf > 0 || num_ts_lpf > 0) {
      if (num_interp != num_ms_lpf || num_interp != num_ts_lpf) {
         Error(1, "hts_engine: specify %d models(trees) for each parameter.\n",
               num_interp);
      }
   }
   /* the list file name is passed to Getfp() below and must have been given */
   if (guifn == NULL)
      Error(1, "hts_engine: label list file must be specified.\n");

   /* initialize (stream[0] = spectrum, stream[1] = lf0, stream[2] = low-pass filter) */
   if (num_ms_lpf > 0 || num_ts_lpf > 0) {
      HTS_Engine_initialize(&engine, 3);
   } else {
      HTS_Engine_initialize(&engine, 2);
   }

   /* load duration model */
   HTS_Engine_load_duration_from_fn(&engine, fn_ms_dur, fn_ts_dur, num_interp);
   /* load stream[0] (spectrum model) */
   HTS_Engine_load_parameter_from_fn(&engine, fn_ms_mgc, fn_ts_mgc, fn_ws_mgc,
                                     0, FALSE, num_ws_mgc, num_interp);
   /* load stream[1] (lf0 model) */
   HTS_Engine_load_parameter_from_fn(&engine, fn_ms_lf0, fn_ts_lf0, fn_ws_lf0,
                                     1, TRUE, num_ws_lf0, num_interp);
   /* load stream[2] (low-pass filter model) */
   if (num_ms_lpf > 0 || num_ts_lpf > 0)
      HTS_Engine_load_parameter_from_fn(&engine, fn_ms_lpf, fn_ts_lpf,
                                        fn_ws_lpf, 2, FALSE, num_ws_lpf,
                                        num_interp);
   /* load gv[0] (GV for spectrum) */
   if (num_interp == num_ms_gvm) {
      if (num_ms_gvm == num_ts_gvm)
         HTS_Engine_load_gv_from_fn(&engine, fn_ms_gvm, fn_ts_gvm, 0,
                                    num_interp);
      else
         HTS_Engine_load_gv_from_fn(&engine, fn_ms_gvm, NULL, 0, num_interp);
   }
   /* load gv[1] (GV for lf0) */
   if (num_interp == num_ms_gvl) {
      if (num_ms_gvl == num_ts_gvl)
         HTS_Engine_load_gv_from_fn(&engine, fn_ms_gvl, fn_ts_gvl, 1,
                                    num_interp);
      else
         HTS_Engine_load_gv_from_fn(&engine, fn_ms_gvl, NULL, 1, num_interp);
   }
   /* load gv[2] (GV for low-pass filter) */
   if (num_interp == num_ms_gvf && (num_ms_lpf > 0 || num_ts_lpf > 0)) {
      /* both branches must target stream 2; index 0 would overwrite the spectrum GV */
      if (num_ms_gvf == num_ts_gvf)
         HTS_Engine_load_gv_from_fn(&engine, fn_ms_gvf, fn_ts_gvf, 2,
                                    num_interp);
      else
         HTS_Engine_load_gv_from_fn(&engine, fn_ms_gvf, NULL, 2, num_interp);
   }
   /* load GV switch */
   if (fn_gv_switch != NULL)
      HTS_Engine_load_gv_switch_from_fn(&engine, fn_gv_switch);

   /* set parameter */
   HTS_Engine_set_sampling_rate(&engine, sampling_rate);
   HTS_Engine_set_fperiod(&engine, fperiod);
   HTS_Engine_set_alpha(&engine, alpha);
   HTS_Engine_set_gamma(&engine, stage);
   HTS_Engine_set_log_gain(&engine, use_log_gain);
   HTS_Engine_set_beta(&engine, beta);
   HTS_Engine_set_audio_buff_size(&engine, audio_buff_size);
   HTS_Engine_set_msd_threshold(&engine, 1, uv_threshold);      /* set voiced/unvoiced threshold for stream[1] */
   HTS_Engine_set_gv_weight(&engine, 0, gv_weight_mgc);
   HTS_Engine_set_gv_weight(&engine, 1, gv_weight_lf0);
   if (num_ms_lpf > 0 || num_ts_lpf > 0)
      HTS_Engine_set_gv_weight(&engine, 2, gv_weight_lpf);
   for (i = 0; i < num_interp; i++) {
      HTS_Engine_set_duration_interpolation_weight(&engine, i, rate_interp[i]);
      HTS_Engine_set_parameter_interpolation_weight(&engine, 0, i,
                                                    rate_interp[i]);
      HTS_Engine_set_parameter_interpolation_weight(&engine, 1, i,
                                                    rate_interp[i]);
      if (num_ms_lpf > 0 || num_ts_lpf > 0)
         HTS_Engine_set_parameter_interpolation_weight(&engine, 2, i,
                                                       rate_interp[i]);
   }
   if (num_interp == num_ms_gvm)
      for (i = 0; i < num_interp; i++)
         HTS_Engine_set_gv_interpolation_weight(&engine, 0, i, rate_interp[i]);
   if (num_interp == num_ms_gvl)
      for (i = 0; i < num_interp; i++)
         HTS_Engine_set_gv_interpolation_weight(&engine, 1, i, rate_interp[i]);
   if (num_interp == num_ms_gvf && (num_ms_lpf > 0 || num_ts_lpf > 0))
      for (i = 0; i < num_interp; i++)
         HTS_Engine_set_gv_interpolation_weight(&engine, 2, i, rate_interp[i]);

   /* synthesis: one label file name per line of the list file */
   guifp = Getfp(guifn, "r");
   while (getline(&labfn, &labfn_size, guifp) != -1) {
      chomp(labfn);
      HTS_Engine_load_label_from_fn(&engine, labfn);       /* load label file */
      if (phoneme_alignment)       /* modify label */
         HTS_Label_set_frame_specified_flag(&engine.label, TRUE);
      if (speech_speed != 1.0)     /* modify label */
         HTS_Label_set_speech_speed(&engine.label, speech_speed);
      HTS_Engine_create_sstream(&engine);  /* parse label and determine state duration */
      if (half_tone != 0.0) {      /* modify f0 */
         for (i = 0; i < HTS_SStreamSet_get_total_state(&engine.sss); i++) {
            f = HTS_SStreamSet_get_mean(&engine.sss, 1, i, 0);
            f += half_tone * log(2.0) / 12;
            if (f < log(10.0))
               f = log(10.0);
            HTS_SStreamSet_set_mean(&engine.sss, 1, i, 0, f);
         }
      }
      HTS_Engine_create_pstream(&engine);  /* generate speech parameter vector sequence */
      HTS_Engine_create_gstream(&engine);  /* synthesize speech */

      /* output */
      if (rawfp)
         HTS_Engine_save_generated_speech(&engine, rawfp);

      /* free */
      HTS_Engine_refresh(&engine);
   }
   if (guifp != NULL)
      fclose(guifp);
   /* line buffer allocated by getline() */
   free(labfn);

   /* free memory */
   HTS_Engine_clear(&engine);
   free(rate_interp);
   free(fn_ws_mgc);
   free(fn_ws_lf0);
   free(fn_ws_lpf);
   free(fn_ms_mgc);
   free(fn_ms_lf0);
   free(fn_ms_lpf);
   free(fn_ms_dur);
   free(fn_ts_mgc);
   free(fn_ts_lf0);
   free(fn_ts_lpf);
   free(fn_ts_dur);
   free(fn_ms_gvm);
   free(fn_ms_gvl);
   free(fn_ms_gvf);
   free(fn_ts_gvm);
   free(fn_ts_gvl);
   free(fn_ts_gvf);

   /* close files */
   if (rawfp != NULL)
      fclose(rawfp);

   return 0;
}
Ejemplo n.º 15
0
 /*
  * do_reset: clear the stop flag of the wrapped engine, refresh it and set
  * its additional half-tone value to 0, in that order.
  */
 void std_hts_engine_impl::do_reset()
 {
   HTS_Engine_set_stop_flag(engine.get(),false);
   HTS_Engine_refresh(engine.get());
   HTS_Engine_add_half_tone(engine.get(),0);
 }
Ejemplo n.º 16
0
/* hts_take_option_value: step to the argument that follows the current option.
 *
 * argc/argv form the option-scanning cursor of htsSynthesize: *argc counts the
 * arguments not yet consumed (including the current one) and *argv points at
 * the current one.
 *
 * Returns the next argument and advances the cursor by one, or returns NULL
 * and leaves the cursor untouched when the command line ends first. */
static char *hts_take_option_value(int *argc, char ***argv)
{
	if (*argc <= 1)
		return NULL;
	--(*argc);
	++(*argv);
	return **argv;
}

/* htsSynthesize: hts_engine command-line driver packaged as a function.
 *
 * argc/argv use the usual main() layout (argv[0] is the program name).
 * Recognised arguments:
 *   -m voice      HTS voice file (may be repeated, at least one required)
 *   -vp           use phoneme alignment from the label
 *   -ow/-or/-od/-om/-of/-op/-ol/-ot file
 *                 wav / raw / duration label / stream 0 / stream 1 (-of and
 *                 -op are synonyms) / stream 2 / trace output files
 *   -s -p -a -b -r -u -g -z value
 *                 sampling frequency, frame period, alpha, beta, speed,
 *                 MSD threshold of stream 1, volume, audio buffer size
 *   -fm value     additional half tone
 *   -jm/-jf/-jp value
 *                 GV weight of stream 0 / stream 1
 *   -i n w1..wn   interpolation weights, n must equal the number of voices
 *   -h            print usage
 *   anything not starting with '-' is taken as the input label file name.
 *
 * Returns 0 on success and -1 on any error. All output files that were opened
 * are closed on every return path taken after option parsing has started. */
int htsSynthesize(int argc, char **argv)
{
	int i;
	int status = -1;
	double f;

	/* option currently being parsed and the value that follows it */
	const char *opt = "";
	char *value = NULL;

	/* destination and fopen() mode selected by an -o? option */
	FILE **outfp = NULL;
	const char *mode = NULL;

	/* hts_engine API */
	HTS_Engine engine;

	/* HTS voices */
	size_t num_voices;
	char **fn_voices;

	/* input label file name */
	char *labfn = NULL;

	/* output file pointers */
	FILE *durfp = NULL, *mgcfp = NULL, *lf0fp = NULL, *lpffp = NULL, *wavfp = NULL, *rawfp = NULL, *tracefp = NULL;

	/* interpolation weights */
	size_t num_interpolation_weights;

	/* output usage */
	if (argc <= 1)
		usage();

	/* initialize hts_engine API */
	HTS_Engine_initialize(&engine);

	/* get HTS voice file names */
	num_voices = 0;
	fn_voices = (char **) malloc((size_t) argc * sizeof(char *));
	if (fn_voices == NULL)
	{
		fprintf(stderr, "Error: Cannot allocate memory for HTS voice file names.\n");
		return (-1);
	}
	for (i = 0; i < argc; i++)
	{
		if (argv[i][0] == '-' && argv[i][1] == 'm')
		{
			/* a trailing "-m" has no file name; argv[argc] is NULL */
			if (i + 1 >= argc)
			{
				fprintf(stderr, "Error: Option '-m' requires an argument.\n");
				free(fn_voices);
				return (-1);
			}
			fn_voices[num_voices++] = argv[++i];
		}
		if (argv[i][0] == '-' && argv[i][1] == 'h')
			usage();
	}
	if (num_voices == 0)
	{
		fprintf(stderr, "Error: HTS voice must be specified.\n");
		free(fn_voices);
		return (-1);
	}

	/* load HTS voices */
	if (HTS_Engine_load(&engine, fn_voices, num_voices) != TRUE)
	{
		fprintf(stderr, "Error: HTS voices cannot be loaded.\n");
		free(fn_voices);
		goto cleanup;
	}
	free(fn_voices);

	/* get options; inside the loop argc counts the arguments still unread,
	 * including the one *argv currently points at */
	while (--argc)
	{
		if (**++argv != '-')
		{
			labfn = *argv;
			continue;
		}

		opt = *argv;
		switch (opt[1])
		{
		case 'v':
			switch (opt[2])
			{
			case 'p':
				HTS_Engine_set_phoneme_alignment_flag(&engine, TRUE);
				break;
			default:
				fprintf(stderr, "Error: Invalid option '-v%c'.\n", opt[2]);
				goto cleanup;
			}
			break;
		case 'o':
			/* validate the sub-option before consuming its file name */
			switch (opt[2])
			{
			case 'w':
				outfp = &wavfp;
				mode = "wb";
				break;
			case 'r':
				outfp = &rawfp;
				mode = "wb";
				break;
			case 'd':
				outfp = &durfp;
				mode = "wt";
				break;
			case 'm':
				outfp = &mgcfp;
				mode = "wb";
				break;
			case 'f':
			case 'p':
				outfp = &lf0fp;
				mode = "wb";
				break;
			case 'l':
				outfp = &lpffp;
				mode = "wb";
				break;
			case 't':
				outfp = &tracefp;
				mode = "wt";
				break;
			default:
				fprintf(stderr, "Error: Invalid option '-o%c'.\n", opt[2]);
				goto cleanup;
			}
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			/* a repeated option replaces the earlier stream instead of leaking it */
			if (*outfp != NULL)
				fclose(*outfp);
			*outfp = fopen(value, mode);
			/* an unopenable file only disables that output, as before */
			if (*outfp == NULL)
				fprintf(stderr, "Warning: Cannot open output file '%s'.\n", value);
			break;
		case 'h':
			usage();
			break;
		case 'm':
			/* HTS voices were already loaded; just skip the file name */
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			break;
		case 's':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_sampling_frequency(&engine, (size_t) atoi(value));
			break;
		case 'p':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_fperiod(&engine, (size_t) atoi(value));
			break;
		case 'a':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_alpha(&engine, atof(value));
			break;
		case 'b':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_beta(&engine, atof(value));
			break;
		case 'r':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_speed(&engine, atof(value));
			break;
		case 'f':
			switch (opt[2])
			{
			case 'm':
				value = hts_take_option_value(&argc, &argv);
				if (value == NULL)
					goto missing_value;
				HTS_Engine_add_half_tone(&engine, atof(value));
				break;
			default:
				fprintf(stderr, "Error: Invalid option '-f%c'.\n", opt[2]);
				goto cleanup;
			}
			break;
		case 'u':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_msd_threshold(&engine, 1, atof(value));
			break;
		case 'i':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			num_interpolation_weights = (size_t) atoi(value);
			if (num_interpolation_weights != num_voices)
			{
				fprintf(stderr, "Error: Number of interpolation weights must match the number of HTS voices.\n");
				goto cleanup;
			}
			/* the same weight is applied to duration, both parameter
			 * streams and both GV streams of voice i */
			for (i = 0; i < (int) num_interpolation_weights; i++)
			{
				value = hts_take_option_value(&argc, &argv);
				if (value == NULL)
					goto missing_value;
				f = atof(value);
				HTS_Engine_set_duration_interpolation_weight(&engine, i, f);
				HTS_Engine_set_parameter_interpolation_weight(&engine, i, 0, f);
				HTS_Engine_set_parameter_interpolation_weight(&engine, i, 1, f);
				HTS_Engine_set_gv_interpolation_weight(&engine, i, 0, f);
				HTS_Engine_set_gv_interpolation_weight(&engine, i, 1, f);
			}
			break;
		case 'j':
			switch (opt[2])
			{
			case 'm':
				value = hts_take_option_value(&argc, &argv);
				if (value == NULL)
					goto missing_value;
				HTS_Engine_set_gv_weight(&engine, 0, atof(value));
				break;
			case 'f':
			case 'p':
				value = hts_take_option_value(&argc, &argv);
				if (value == NULL)
					goto missing_value;
				HTS_Engine_set_gv_weight(&engine, 1, atof(value));
				break;
			default:
				fprintf(stderr, "Error: Invalid option '-j%c'.\n", opt[2]);
				goto cleanup;
			}
			break;
		case 'g':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_volume(&engine, atof(value));
			break;
		case 'z':
			value = hts_take_option_value(&argc, &argv);
			if (value == NULL)
				goto missing_value;
			HTS_Engine_set_audio_buff_size(&engine, (size_t) atoi(value));
			break;
		default:
			fprintf(stderr, "Error: Invalid option '-%c'.\n", opt[1]);
			goto cleanup;
		}
	}

	/* synthesize */
	if (labfn == NULL)
	{
		fprintf(stderr, "Error: Label file must be specified.\n");
		goto cleanup;
	}
	if (HTS_Engine_synthesize_from_fn(&engine, labfn) != TRUE)
	{
		fprintf(stderr, "Error: waveform cannot be synthesized.\n");
		goto cleanup;
	}

	/* output */
	if (tracefp != NULL)
		HTS_Engine_save_information(&engine, tracefp);
	if (durfp != NULL)
		HTS_Engine_save_label(&engine, durfp);
	if (rawfp != NULL)
		HTS_Engine_save_generated_speech(&engine, rawfp);
	if (wavfp != NULL)
		HTS_Engine_save_riff(&engine, wavfp);
	if (mgcfp != NULL)
		HTS_Engine_save_generated_parameter(&engine, 0, mgcfp);
	if (lf0fp != NULL)
		HTS_Engine_save_generated_parameter(&engine, 1, lf0fp);
	if (lpffp != NULL)
		HTS_Engine_save_generated_parameter(&engine, 2, lpffp);

	/* reset */
	HTS_Engine_refresh(&engine);
	status = 0;
	goto cleanup;

missing_value:
	fprintf(stderr, "Error: Option '%s' requires an argument.\n", opt);

cleanup:
	/* free memory */
	HTS_Engine_clear(&engine);

	/* close files */
	if (durfp != NULL)
		fclose(durfp);
	if (mgcfp != NULL)
		fclose(mgcfp);
	if (lf0fp != NULL)
		fclose(lf0fp);
	if (lpffp != NULL)
		fclose(lpffp);
	if (wavfp != NULL)
		fclose(wavfp);
	if (rawfp != NULL)
		fclose(rawfp);
	if (tracefp != NULL)
		fclose(tracefp);

	return status;
}
Ejemplo n.º 17
0
/* Run: synthesize the audio of an utterance with the HTS engine held by
 * the utterance processor.
 *
 * self  - the utterance processor; cast to SHTSEngineMESynthUttProc105 to
 *         reach the engine and the mixed excitation settings.
 * utt   - the utterance to process; it must contain a "Segment" relation.
 * error - Speect error code, set when any step fails.
 *
 * Steps, in order:
 *  1. build one HTS label string per "Segment" item from the "hts_labels"
 *     feature processor and store the feature object on the item as
 *     "hts_label";
 *  2. load the labels into the engine, apply the rate/volume and tone
 *     adjustments, and create the state, parameter and speech streams
 *     (mixed excitation variant when HTSsynth->me is TRUE);
 *  3. set the "start" and "end" features of each segment from the summed
 *     state durations (frame index * fperiod / sampling_rate);
 *  4. set the "audio_plugin" and "audio" features on the utterance and
 *     copy the generated speech samples into the audio object.
 *
 * The temporary label strings are released on every path, and the engine
 * is refreshed on every path on which labels were loaded into it. */
static void Run(const SUttProcessor *self, SUtterance *utt,
				s_erc *error)
{
	SHTSEngineMESynthUttProc105 *HTSsynth = (SHTSEngineMESynthUttProc105*)self;
	SPlugin *audioPlugin;
	const SRelation *segmentRel;
	SAudio *audio = NULL;
	s_bool is_present;
	s_bool labels_loaded = FALSE; /* engine holds per-utterance state */
	char **label_data = NULL;
	int label_size;
	const SItem *item;
	const SItem *itemItr;
	int counter;
	uint i;
	int frame;
	int state;
	/* duration of one frame in units of 1e-7 seconds */
	const double rate = HTSsynth->engine.global.fperiod * 1e+7 / HTSsynth->engine.global.sampling_rate;
	int nstate;


	S_CLR_ERR(error);

	/* we require the segment relation */
	is_present = SUtteranceRelationIsPresent(utt, "Segment", error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SUtteranceRelationIsPresent\" failed"))
		goto quit_error;

	if (!is_present)
	{
		S_CTX_ERR(error, S_FAILURE,
				  "Run",
				  "Failed to find 'Segment' relation in utterance");
		goto quit_error;
	}

	segmentRel = SUtteranceGetRelation(utt, "Segment", error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SUtteranceGetRelation\" failed"))
		goto quit_error;

	item = SRelationHead(segmentRel, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SRelationHead\" failed"))
		goto quit_error;

	/* count the segments so the label array can be sized */
	itemItr = item;
	label_size = 0;
	while (itemItr != NULL)
	{
		label_size++;
		itemItr = SItemNext(itemItr, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemNext\" failed"))
			goto quit_error;
	}

	/* zero-filled, so the clean-up code can tell which entries were set */
	label_data = S_CALLOC(char*, label_size);
	if ((label_size > 0) && (label_data == NULL))
	{
		S_FTL_ERR(error, S_MEMERROR,
				  "Run",
				  "Failed to allocate memory for 'char*' object");
		goto quit_error;
	}

	itemItr = item;
	counter = 0;
	while (itemItr != NULL)
	{
		SObject *dFeat;
		const char *tmp;


		dFeat = SItemPathToFeatProc(itemItr, "hts_labels", error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemPathToFeatProc\" failed"))
			goto quit_error;

		if (dFeat == NULL)
		{
			S_CTX_ERR(error, S_FAILURE,
					  "Run",
					  "Failed to generate hts labels for segment item");
			goto quit_error;
		}

		tmp = SObjectGetString(dFeat, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SObjectGetString\" failed"))
			goto quit_error;

		label_data[counter++] = s_strdup(tmp, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"s_strdup\" failed"))
			goto quit_error;

		SItemSetObject((SItem*)itemItr, "hts_label", dFeat, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemSetObject\" failed"))
			goto quit_error;

		itemItr = SItemNext(itemItr, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemNext\" failed"))
			goto quit_error;
	}

	/* speech synthesis part */
	HTS_Engine_load_label_from_string_list(&(HTSsynth->engine), label_data, label_size);
	/* from here on the engine must be refreshed before Run returns */
	labels_loaded = TRUE;

	check_and_change_rate_volume(HTSsynth, utt, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"check_and_change_rate_volume\" failed"))
		goto quit_error;

	HTS_Engine_create_sstream(&(HTSsynth->engine));
	check_and_change_tone(HTSsynth, utt, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"check_and_change_tone\" failed"))
		goto quit_error;

	HTS_Engine_create_pstream(&(HTSsynth->engine));

	if (HTSsynth->me == TRUE) /* mixed excitation */
	{
		HTS_Engine_create_gstream_me(&(HTSsynth->engine),
									 HTSsynth->me_num_filters, HTSsynth->me_filter_order,
									 HTSsynth->me_filter, HTSsynth->xp_sig, HTSsynth->xn_sig,
									 HTSsynth->hp, HTSsynth->hn,
									 HTSsynth->pd_filter, HTSsynth->pd_filter_order);
	}
	else
	{
		HTS_Engine_create_gstream(&(HTSsynth->engine));
	}

	/* segment timing: each segment spans nstate consecutive states */
	nstate = HTS_Speect_ModelSet_get_nstate(&(HTSsynth->engine));
	itemItr = item;
	frame = 0;
	state = 0;
	while (itemItr != NULL)
	{
		int j;
		int duration = 0;
		float tmp;

		for (j = 0; j < nstate; j++)
			duration += HTS_Speect_SStreamSet_get_duration(&(HTSsynth->engine), state++);

		tmp = frame * rate;
		SItemSetFloat((SItem*)itemItr, "start", tmp/1e+7, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemSetFloat\" failed"))
			goto quit_error;

		tmp = (frame + duration) * rate;
		SItemSetFloat((SItem*)itemItr, "end", tmp/1e+7, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemSetFloat\" failed"))
			goto quit_error;

		frame += duration;
		itemItr = SItemNext(itemItr, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemNext\" failed"))
			goto quit_error;
	}

	/* We need to give the utterance the audio plug-in. If we don't do
	 * this and the voice is deleted before the utterance, then the
	 * utterance can't do *anything* with the audio. Not even delete
	 * it (segfault). This should be fast because it is already
	 * loaded.
	 * Note that this happens before the audio is set. This is because
	 * utt features are a list implementation.
	 */
	audioPlugin = s_pm_load_plugin("audio.spi", error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"s_pm_load_plugin\" failed"))
		goto quit_error;

	SUtteranceSetFeature(utt, "audio_plugin", S_OBJECT(audioPlugin), error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SUtteranceSetFeature\" failed"))
	{
		S_DELETE(audioPlugin, "Run", error);
		goto quit_error;
	}

	/* create an audio object */
	audio = S_NEW(SAudio, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Failed to create new 'SAudio' object"))
		goto quit_error;

	/* set audio feature in utterance */
	SUtteranceSetFeature(utt, "audio", S_OBJECT(audio), error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SUtteranceSetFeature\" failed"))
	{
		S_DELETE(audio, "Run", error);
		goto quit_error;
	}

	audio->sample_rate = HTSsynth->engine.global.sampling_rate;
	audio->num_samples = (uint32)HTS_Speect_GStreamSet_get_total_nsample(&(HTSsynth->engine));
	audio->samples = S_MALLOC(float, audio->num_samples);
	if (audio->samples == NULL)
	{
		S_FTL_ERR(error, S_MEMERROR,
				  "Run",
				  "Failed to allocate memory for 'float' object");
		goto quit_error;
	}

	/* write data */
	for (i = 0; i < audio->num_samples; i++)
		audio->samples[i] = (float)(HTS_Speect_GStreamSet_get_speech(&(HTSsynth->engine), i) * 1.0);

	for (counter = 0; counter < label_size; counter++)
		S_FREE(label_data[counter]);
	S_FREE(label_data);

	HTS_Engine_refresh(&(HTSsynth->engine));

	/* all OK here */
	return;

	/* error clean-up code */
quit_error:
	if (label_data != NULL)
	{
		for (counter = 0; counter < label_size; counter++)
		{
			if (label_data[counter] != NULL)
				S_FREE(label_data[counter]);
		}

		S_FREE(label_data);
	}

	/* the engine outlives this call: drop the labels and streams of the
	 * failed utterance, as the success path does */
	if (labels_loaded)
		HTS_Engine_refresh(&(HTSsynth->engine));

	return;
}