/* Flite_HTS_Engine_synthesize: synthesize speech
 *
 * Runs the flite front-end on txt, builds one label string per item of the
 * "Segment" relation, feeds the labels to the HTS engine and, when wav is not
 * NULL, writes the result to that path as RIFF.
 *
 * f:   engine wrapper; its `engine` member is used for synthesis
 * txt: text to synthesize; NULL yields FALSE
 * wav: output file path, or NULL to skip writing a file
 *
 * Returns TRUE on success. Returns FALSE when txt is NULL, the voice or
 * utterance cannot be created, the utterance has no segments, memory cannot
 * be allocated, or the output file cannot be opened/closed.
 */
HTS_Boolean Flite_HTS_Engine_synthesize(Flite_HTS_Engine * f, const char *txt, const char *wav)
{
   int i;
   FILE *fp;
   cst_voice *v = NULL;
   cst_utterance *u = NULL;
   cst_item *s = NULL;
   char **label_data = NULL;
   int label_size = 0;
   HTS_Boolean result = FALSE;

   if (txt == NULL)
      return FALSE;

   /* text analysis part */
   v = REGISTER_VOX(NULL);
   if (v == NULL)
      return FALSE;
   u = flite_synth_text(txt, v);
   if (u == NULL)
      goto cleanup;
   for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s))
      label_size++;
   if (label_size <= 0)
      goto cleanup;

   /* calloc zero-fills the pointer table, so the cleanup loop may free */
   /* every slot even when a later allocation fails part-way through */
   label_data = (char **) calloc(label_size, sizeof(char *));
   if (label_data == NULL)
      goto cleanup;
   for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) {
      label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char));
      if (label_data[i] == NULL)
         goto cleanup;
      Flite_HTS_Engine_create_label(f, s, label_data[i]);
   }

   /* speech synthesis part */
   HTS_Engine_synthesize_from_strings(&f->engine, label_data, label_size);
   result = TRUE;
   if (wav != NULL) {
      fp = fopen(wav, "wb");
      if (fp == NULL) {
         /* cannot write the requested file: report failure, but still */
         /* refresh the engine below */
         result = FALSE;
      } else {
         HTS_Engine_save_riff(&f->engine, fp);
         if (fclose(fp) != 0)
            result = FALSE;
      }
   }
   HTS_Engine_refresh(&f->engine);

 cleanup:
   if (label_data != NULL) {
      for (i = 0; i < label_size; i++)
         free(label_data[i]);
      free(label_data);
   }
   if (u != NULL)
      delete_utterance(u);
   UNREGISTER_VOX(v);
   return result;
}
void test_hrg(void) { cst_utterance *u; cst_relation *r; cst_item *item = 0; int i; u = new_utterance(); r = utt_relation_create(u, "Segment"); for (i = 0; i < 10; i++) { char buff[20]; sprintf(buff, "seg_%03d", i); if (i == 0) item = relation_append(r, NULL); else item = item_append(item, NULL); item_set_string(item, "name", buff); item_set_float(item, "duration", i * 0.20); } for (i = 0, item = relation_head(utt_relation(u, "Segment")); item; item = item_next(item), i++) { TEST_CHECK(item_feat_float(item, "duration") == correct_list[i]); } delete_utterance(u); }
/* Build a ten-item "Segment" relation (names "seg_000".."seg_009", */
/* durations i * 0.20) and print one line per item. Command-line */
/* arguments are not used. Always returns 0. */
int main(int argc, char **argv)
{
    cst_utterance *utt = new_utterance();
    cst_relation *segments = utt_relation_create(utt, "Segment");
    cst_item *cur = 0;
    char name[20];
    int n;

    /* The first item goes onto the relation; later ones follow the */
    /* previously created item. */
    for (n = 0; n < 10; n++)
    {
        sprintf(name, "seg_%03d", n);
        cur = (n == 0) ? relation_append(segments, NULL)
                       : item_append(cur, NULL);
        item_set_string(cur, "name", name);
        item_set_float(cur, "duration", n * 0.20);
    }

    n = 0;
    cur = relation_head(utt_relation(utt, "Segment"));
    while (cur)
    {
        printf("Segment %d %s %f\n",
               n,
               item_feat_string(cur, "name"),
               item_feat_float(cur, "duration"));
        cur = item_next(cur);
        n++;
    }

    delete_utterance(utt);
    return 0;
}
/* Load the "Word" relation from "ttt.txt", run WordSylSeg on the */
/* utterance and print, for every "Segment" item after the first one, */
/* its name, the next and previous names, and the name of its */
/* SylStructure parent. Command-line arguments are not used. */
/* Always returns 0. */
int main(int argc, char **argv)
{
    cst_utterance *utt;
    cst_relation *words;
    cst_item *seg;

    cmu_lex_init();

    utt = new_utterance();
    words = utt_relation_create(utt, "Word");
    bbb_relation_load(words, "ttt.txt");
    WordSylSeg(utt);

    /* Iteration starts at the item following the relation head. */
    seg = item_next(relation_head(utt_relation(utt, "Segment")));
    while (seg)
    {
        printf("Segment %s %s %s %s\n",
               ffeature_string(seg, "name"),
               ffeature_string(seg, "n.name"),
               ffeature_string(seg, "p.name"),
               ffeature_string(seg, "R:SylStructure.parent.name"));
        seg = item_next(seg);
    }

    delete_utterance(utt);
    return 0;
}
/* flite_text_to_wave: synthesize text with the given voice and return a */
/* copy of the resulting waveform (made with copy_wave). The intermediate */
/* utterance is deleted before returning. Returns NULL when */
/* flite_synth_text returns NULL. */
cst_wave *flite_text_to_wave(const char *text, cst_voice *voice)
{
    cst_utterance *utt = flite_synth_text(text, voice);
    cst_wave *result;

    if (utt == NULL)
    {
        return NULL;
    }

    /* Copy before deleting the utterance the waveform was read from. */
    result = copy_wave(utt_wave(utt));
    delete_utterance(utt);
    return result;
}
/* flite_phones_to_speech: synthesize from a phone string with */
/* flite_synth_phones, hand the utterance to flite_process_output */
/* (append flag FALSE) together with outtype, then delete the utterance. */
/* Returns the value reported by flite_process_output. */
float flite_phones_to_speech(const char *text, cst_voice *voice, const char *outtype)
{
    cst_utterance *utt = flite_synth_phones(text, voice);
    float seconds = flite_process_output(utt, outtype, FALSE);

    delete_utterance(utt);
    return seconds;
}
/* flite_synth_foo: initialise u for voice, then run the synth function */
/* on it. Returns u when synth returns non-NULL; otherwise deletes u and */
/* returns NULL. */
static cst_utterance *flite_synth_foo(cst_utterance *u, cst_voice *voice, cst_uttfunc synth)
{
    utt_init(u, voice);

    if ((*synth)(u) != NULL)
    {
        return u;
    }

    /* Synthesis failed: the utterance is released here. */
    delete_utterance(u);
    return NULL;
}
/* Flite_Text_Analyzer_analysis: text analysis
 *
 * Synthesizes text with a freshly registered voice and stores the voice, the
 * utterance and an array of pointers to the items of its "Segment" relation
 * in a Flite_Utterance hung off analyzer->pointer. Any previous result held
 * by the analyzer is cleared first.
 *
 * Does nothing when analyzer or text is NULL. On any failure (allocation,
 * voice registration, synthesis, or an utterance with no segments) everything
 * acquired so far is released and analyzer->pointer is left NULL.
 */
void Flite_Text_Analyzer_analysis(Flite_Text_Analyzer * analyzer, const char *text)
{
   int i;
   cst_item *s;
   Flite_Utterance *fu;

   if (analyzer == NULL || text == NULL)
      return;

   if (analyzer->pointer != NULL)
      Flite_Text_Analyzer_clear(analyzer);

   /* allocate */
   fu = (Flite_Utterance *) malloc(sizeof(Flite_Utterance));
   if (fu == NULL)
      return;

   /* create voice */
   fu->v = REGISTER_VOX(NULL);
   if (fu->v == NULL) {
      free(fu);
      return;
   }

   /* create utterance */
   fu->u = flite_synth_text(text, fu->v);
   if (fu->u == NULL) {
      UNREGISTER_VOX(fu->v);
      free(fu);
      return;
   }

   /* count number of phonemes */
   fu->nitem = 0;
   for (s = relation_head(utt_relation(fu->u, "Segment")); s; s = item_next(s))
      fu->nitem++;
   if (fu->nitem == 0) {
      delete_utterance(fu->u);
      UNREGISTER_VOX(fu->v);
      free(fu);
      return;
   }

   /* save information; the item pointers refer into fu->u */
   fu->items = (cst_item **) malloc(sizeof(cst_item *) * fu->nitem);
   if (fu->items == NULL) {
      delete_utterance(fu->u);
      UNREGISTER_VOX(fu->v);
      free(fu);
      return;
   }
   for (i = 0, s = relation_head(utt_relation(fu->u, "Segment")); s; s = item_next(s), i++)
      fu->items[i] = s;

   analyzer->pointer = (void *) fu;
}
/* Flite_Text_Analyzer_clear: finalize flite front-end
 *
 * Releases the Flite_Utterance stored in analyzer->pointer (item array,
 * utterance, voice, then the structure itself) and resets the pointer to
 * NULL. Does nothing when analyzer is NULL or holds no result.
 */
void Flite_Text_Analyzer_clear(Flite_Text_Analyzer * analyzer)
{
   Flite_Utterance *fu;

   if (analyzer == NULL)
      return;

   fu = (Flite_Utterance *) analyzer->pointer;
   if (fu == NULL)
      return;

   /* free(NULL) is a no-op, so the item array needs no guard */
   free(fu->items);
   if (fu->u != NULL) {
      delete_utterance(fu->u);
   }
   if (fu->v != NULL) {
      UNREGISTER_VOX(fu->v);
   }
   free(fu);

   analyzer->pointer = NULL;
}
/* Flite_HTS_Engine_synthesis: speech synthesis
 *
 * Runs the flite front-end on txt, builds one label string per item of the
 * "Segment" relation, loads the labels into the HTS engine, generates the
 * state/parameter/speech streams and, when wavfp is not NULL, writes the
 * result to it as RIFF. wavfp is neither opened nor closed here.
 *
 * Returns silently, releasing everything acquired so far, when txt is NULL,
 * the voice or utterance cannot be created, the utterance has no segments,
 * or memory cannot be allocated.
 */
void Flite_HTS_Engine_synthesis(Flite_HTS_Engine * f, char *txt, FILE * wavfp)
{
   int i;
   cst_voice *v = NULL;
   cst_utterance *u = NULL;
   cst_item *s = NULL;
   char **label_data = NULL;
   int label_size = 0;

   if (txt == NULL)
      return;

   /* text analysis part */
   v = REGISTER_VOX(NULL);
   if (v == NULL)
      return;
   u = flite_synth_text(txt, v);
   if (u == NULL)
      goto cleanup;
   for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s))
      label_size++;
   if (label_size <= 0)
      goto cleanup;

   /* calloc zero-fills the pointer table, so the cleanup loop may free */
   /* every slot even when a later allocation fails part-way through */
   label_data = (char **) calloc(label_size, sizeof(char *));
   if (label_data == NULL)
      goto cleanup;
   for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) {
      label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char));
      if (label_data[i] == NULL)
         goto cleanup;
      Flite_HTS_Engine_create_label(f, s, label_data[i]);
   }

   /* speech synthesis part */
   HTS_Engine_load_label_from_string_list(&f->engine, label_data, label_size);
   HTS_Engine_create_sstream(&f->engine);
   HTS_Engine_create_pstream(&f->engine);
   HTS_Engine_create_gstream(&f->engine);
   if (wavfp != NULL)
      HTS_Engine_save_riff(&f->engine, wavfp);
   HTS_Engine_refresh(&f->engine);

 cleanup:
   if (label_data != NULL) {
      for (i = 0; i < label_size; i++)
         free(label_data[i]);
      free(label_data);
   }
   if (u != NULL)
      delete_utterance(u);
   UNREGISTER_VOX(v);
}
/* flite_tokens_to_speech: synthesize an utterance that already carries */
/* tokens (via flite_synth_foo with utt_synth_tokens) and dispatch the */
/* waveform according to outtype: "play" plays it, "none" discards it, */
/* and any other value is passed to cst_wave_append_riff as the target. */
/* The utterance is deleted before returning. */
/* Returns num_samples / sample_rate of the waveform, or -1 when */
/* synthesis fails. */
float flite_tokens_to_speech(cst_utterance *u, cst_voice *voice, const char *outtype)
{
    cst_utterance *synthesized;
    cst_wave *wave;
    float seconds;

    synthesized = flite_synth_foo(u, voice, utt_synth_tokens);
    if (synthesized == NULL)
    {
        return -1;
    }

    wave = utt_wave(synthesized);
    seconds = (float)wave->num_samples / (float)wave->sample_rate;

    if (cst_streq(outtype, "play"))
    {
        play_wave(wave);
    }
    else if (!cst_streq(outtype, "none"))
    {
        cst_wave_append_riff(wave, outtype);
    }

    delete_utterance(synthesized);
    return seconds;
}
// getPhonemes: run the flite front-end on sText and return the names of the
// items in the "Segment" relation, each followed by a space. A segment whose
// "ph_vc" feature is "+" and whose SylStructure parent has stress "1" gets a
// "1" appended to its name.
//
// Returns an empty string when sText is NULL or when the voice or utterance
// cannot be created.
//
// NOTE(review): the voice returned by register_cmu_us_no_wave is never
// released in this function -- confirm whether it is owned/cached elsewhere.
string getPhonemes( const char* sText )
{
	string sRet;

	flite_init();

	if ( !sText )
		return sRet;

	cst_voice* v = register_cmu_us_no_wave( NULL );
	if ( !v )
		return sRet;

	cst_utterance* u = flite_synth_text( sText, v );
	if ( !u )
		return sRet;

	for ( cst_item* s = relation_head( utt_relation( u, "Segment" ) ); s; s = item_next( s ) )
	{
		// Appending a NULL char pointer to a std::string is undefined, so guard it.
		const char* name = item_feat_string( s, "name" );
		if ( name )
			sRet += name;

		/* If its a vowel and is stressed output stress value */
		if ( cst_streq( "+", ffeature_string( s, "ph_vc" ) ) &&
		     cst_streq( "1", ffeature_string( s, "R:SylStructure.parent.stress" ) ) )
		{
			sRet += "1";
		}

		sRet += " ";
	}

	delete_utterance( u );
	return sRet;
}
/* flowm_utt_callback: per-utterance callback used during playback.
 *
 * When flowm_play_status is neither FLOWM_PLAY nor FLOWM_SKIP the utterance
 * is deleted and 0 is returned, which stops synthesis of the next utterance
 * (e.g. after STOP was pressed). Otherwise u is returned unchanged; if
 * TTSWindow is set, the function first
 *   - rebuilds the utterance text from its "Token" relation (truncated with
 *     " ..." when it would not fit in FL_MAX_MSG_CHARS),
 *   - records the file position of the first token in flowm_file_pos and in
 *     the flowm_prev_utt_pos history,
 *   - shows the text and the file-position percentage in the window,
 *   - resets the system idle timer, and
 *   - turns a pending FLOWM_SKIP back into FLOWM_PLAY.
 */
cst_utterance *flowm_utt_callback(cst_utterance *u)
{
    char rst[FL_MAX_MSG_CHARS];
    const char *tok;
    const char *prepunc;
    const char *punc;
    const char *space;
    cst_item *item;
    int rst_len;
    int extend_length;

    /* In order to stop the synthesizer if the STOP button is pressed */
    /* This stops the synthesis of the next utterance */
    if ((flowm_play_status != FLOWM_PLAY) &&
        (flowm_play_status != FLOWM_SKIP))
    {
        delete_utterance(u);
        return 0;
    }

    if (!TTSWindow)
        return u;

    rst[0] = '\0';
    space = "";
    for (item = relation_head(utt_relation(u,"Token"));
         item; item = item_next(item))
    {
        tok = item_feat_string(item,"name");
        prepunc = item_feat_string(item,"prepunctuation");
        punc = item_feat_string(item,"punc");

        if (cst_streq("",space))
            /* Only do this on the first token/word */
            flowm_file_pos = item_feat_int(item,"file_pos");

        rst_len = cst_strlen(rst);
        extend_length = rst_len + 1 + cst_strlen(prepunc) + cst_strlen(punc);

        /* Append at the current end of rst instead of passing rst as both */
        /* destination and "%s" argument: overlapping source and */
        /* destination in sprintf is undefined behaviour. */
        if (cst_strlen(tok)+extend_length+4 < FL_MAX_MSG_CHARS)
            cst_sprintf(rst + rst_len,"%s%s%s%s",space,prepunc,tok,punc);
        else
        {
            if (rst_len+4 < FL_MAX_MSG_CHARS)
                cst_sprintf(rst + rst_len,"%s"," ...");
            break;
        }
        space = " ";
    }

    if (flowm_file_pos > flowm_prev_utt_pos[flowm_utt_pos_pos])
    {
        if ((flowm_utt_pos_pos+1) >= FLOWM_NUM_UTT_POS)
        {
            /* Filled it up, so move it down */
            memmove(flowm_prev_utt_pos,&flowm_prev_utt_pos[1],
                    sizeof(int)*(FLOWM_NUM_UTT_POS-10));
            flowm_utt_pos_pos = (FLOWM_NUM_UTT_POS-10);
        }
        flowm_utt_pos_pos++;
        flowm_prev_utt_pos[flowm_utt_pos_pos] = flowm_file_pos;
    }

    /* Send text to TTSWindow */
    mbstowcs(fl_tts_msg,rst,FL_MAX_MSG_CHARS);
    SetDlgItemText(TTSWindow, FL_SYNTHTEXT, fl_tts_msg);

    /* Update file pos percentage in FilePos window */
    cst_sprintf(rst,"%2.3f",flowm_find_file_percentage());
    mbstowcs(fl_fp_msg,rst,FL_MAX_MSG_CHARS);
    SetDlgItemText(TTSWindow, FL_FILEPOS, fl_fp_msg);

    SystemIdleTimerReset();  /* keep alive while synthesizing */
    if (flowm_play_status == FLOWM_SKIP)
        flowm_play_status = FLOWM_PLAY;

    return u;
}
float flite_text_to_speech_phenome( const char* text, cst_voice* voice, const char* outtype, void* pStream ) { cst_utterance* u; float dur; float end_last = 0; float end_current = 0; float dur_current = 0; float dur_sum = 0; //feat_set_float( voice->features, "duration_stretch", 1 ); u = flite_synth_text( text, voice ); cst_item* s; string sRet; int nPhoneme = 0; for ( s = relation_head( utt_relation( u, "Segment" ) ); s; s = item_next( s ) ) { SPhenomeTiming ps; string sPhoneme = item_feat_string( s, "name" ); sRet += sPhoneme; end_current = item_feat_float( s, "end" ); dur_current = end_current - end_last; //if ( !( nPhoneme == 0 && sPhoneme == "pau" ) ) //{ dur_sum += dur_current; //} ps.fWeight = 1; /* If its a vowel and is stressed output stress value */ if ( ( cst_streq( "+", ffeature_string( s, "ph_vc" ) ) ) && ( cst_streq( "1", ffeature_string( s, "R:SylStructure.parent.stress" ) ) ) ) { sRet += "1"; ps.fWeight = 1.3; } sRet += " "; if ( pStream ) { // fade into each other ps.sName = sPhoneme; ps.fStart = end_current - dur_current; ps.fEnd = end_current; ps.fDuration = dur_current; ( ( CryMT::queue<SPhenomeTiming>* )pStream )->push( ps ); } end_last = end_current; ++nPhoneme; } dur = flite_process_output( u, outtype, FALSE ); delete_utterance( u ); return dur; }
/* flite_file_to_speech: read filename token by token, group the tokens */
/* into utterances and synthesize each one, sending the audio to outtype. */
/*                                                                        */
/* Voice features consulted: the four "text_*" character classes passed   */
/* to ts_open, "file_start_position" (stream offset to start from),       */
/* "utt_break" (utterance break predicate, default default_utt_break) and */
/* "utt_user_callback" (called on each utterance before synthesis; a NULL */
/* result from it ends processing).                                       */
/*                                                                        */
/* When outtype is not "play", "none" or "stream" it is treated as a file */
/* name: an empty RIFF file is written first and each utterance is then   */
/* passed to flite_process_output with the append flag TRUE.              */
/*                                                                        */
/* Returns the sum of the values returned by flite_process_output, or 1   */
/* when the file cannot be opened.                                        */
float flite_file_to_speech(const char *filename, cst_voice *voice, const char *outtype)
{
    cst_tokenstream *ts;
    cst_utterance *utt;
    cst_relation *tokrel;
    cst_breakfunc breakfunc = default_utt_break;
    cst_uttfunc utt_user_callback = 0;
    float durs = 0;
    int num_tokens = 0;
    int start_pos;

    ts = ts_open(filename,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for reading\n", filename);
        return 1;
    }

    start_pos = get_param_int(voice->features,"file_start_position",0);
    if (start_pos > 0)
        ts_set_stream_pos(ts,start_pos);

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    /* Output goes to a file: start it off as an empty wave, since every */
    /* utterance is appended to it afterwards. */
    if (!(cst_streq(outtype,"play") ||
          cst_streq(outtype,"none") ||
          cst_streq(outtype,"stream")))
    {
        cst_wave *empty = new_wave();
        cst_wave_resize(empty,0,1);
        cst_wave_set_sample_rate(empty,16000);
        cst_wave_save_riff(empty,outtype);
        delete_wave(empty);
    }

    utt = new_utterance();
    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        const char *token = ts_get(ts);
        cst_item *t;

        /* An utterance ends on an empty token, after more than 500 */
        /* tokens (upper bound), or when the break predicate says so. */
        if ((cst_strlen(token) == 0) ||
            (num_tokens > 500) ||
            (relation_head(tokrel) && breakfunc(ts,token,tokrel)))
        {
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);
            if (!utt)
                break;

            utt = flite_do_synth(utt,voice,utt_synth_tokens);
            durs += flite_process_output(utt,outtype,TRUE);
            delete_utterance(utt);
            utt = NULL;

            if (ts_eof(ts))
                break;

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;
        }

        num_tokens++;
        t = relation_append(tokrel, NULL);
        item_set_string(t,"name",token);
        item_set_string(t,"whitespace",ts->whitespace);
        item_set_string(t,"prepunctuation",ts->prepunctuation);
        item_set_string(t,"punc",ts->postpunctuation);
        /* Record the position of the token's start: the stream is */
        /* already one character past the token and its punctuation. */
        item_set_int(t,"file_pos",
                     ts->file_pos-(1+
                                   cst_strlen(token)+
                                   cst_strlen(ts->prepunctuation)+
                                   cst_strlen(ts->postpunctuation)));
        item_set_int(t,"line_number",ts->line_number);
    }

    delete_utterance(utt);
    ts_close(ts);
    return durs;
}
/* flite_file_to_speech: read filename token by token, group the tokens */
/* into utterances and pass each one to flite_tokens_to_speech with      */
/* outtype.                                                              */
/*                                                                       */
/* The voice's "text_*" features supply the tokenizer character classes  */
/* and "utt_break", when present, replaces default_utt_break as the      */
/* utterance break predicate. When outtype is neither "play" nor "none"  */
/* it is treated as a file name and an empty RIFF file is written first, */
/* because utterances are appended to it incrementally.                  */
/*                                                                       */
/* Returns the sum of the non-negative values returned by                */
/* flite_tokens_to_speech (processing stops at the first negative one),  */
/* or 1 when the file cannot be opened.                                  */
float flite_file_to_speech(const char *filename, cst_voice *voice, const char *outtype)
{
    cst_tokenstream *ts;
    cst_utterance *utt;
    cst_relation *tokrel;
    cst_breakfunc breakfunc = default_utt_break;
    float total = 0;
    int num_tokens = 0;

    ts = ts_open(filename,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for reading\n", filename);
        return 1;
    }

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    /* Output goes to a file: replace it with an empty wave, since every */
    /* utterance is appended to it afterwards. */
    if (!(cst_streq(outtype,"play") || cst_streq(outtype,"none")))
    {
        cst_wave *empty = new_wave();
        cst_wave_resize(empty,0,1);
        cst_wave_set_sample_rate(empty,16000);
        cst_wave_save_riff(empty,outtype);
        delete_wave(empty);
    }

    utt = new_utterance();
    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        const char *token = ts_get(ts);
        cst_item *t;

        /* An utterance ends on an empty token, after more than 500 */
        /* tokens (upper bound), or when the break predicate says so. */
        if ((strlen(token) == 0) ||
            (num_tokens > 500) ||
            (relation_head(tokrel) && breakfunc(ts,token,tokrel)))
        {
            float d = flite_tokens_to_speech(utt,voice,outtype);

            /* The utterance must not be touched again after this call. */
            utt = NULL;
            if (d < 0)
                break;
            total += d;
            if (ts_eof(ts))
                break;

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;
        }

        num_tokens++;
        t = relation_append(tokrel, NULL);
        item_set_string(t,"name",token);
        item_set_string(t,"whitespace",ts->whitespace);
        item_set_string(t,"prepunctuation",ts->prepunctuation);
        item_set_string(t,"punc",ts->postpunctuation);
        item_set_int(t,"file_pos",ts->file_pos);
        item_set_int(t,"line_number",ts->line_number);
    }

    delete_utterance(utt);
    ts_close(ts);
    return total;
}
/* flite_ssml_to_speech_ts: read SSML from an already opened token stream, */
/* apply tags through ssml_apply_tag and synthesize the text in between,   */
/* utterance by utterance, sending the audio to outtype.                   */
/*                                                                         */
/* Voice features consulted: "text_prepunctuation" and                     */
/* "text_postpunctuation" (character classes), "utt_break" (utterance      */
/* break predicate, default default_utt_break) and "utt_user_callback"     */
/* (called on each utterance before synthesis; a NULL result from it ends  */
/* processing).                                                            */
/*                                                                         */
/* A "<" token starts a tag; "</" marks an end tag, flagged by setting     */
/* "_type" to "end" in its attributes. A "&" token is currently skipped.   */
/* The token stream is not closed here.                                    */
/*                                                                         */
/* Returns the sum of the values returned by flite_process_output.         */
static float flite_ssml_to_speech_ts(cst_tokenstream *ts, cst_voice *voice, const char *outtype)
{
    cst_features *ssml_feats, *ssml_word_feats;
    cst_features *attributes;
    const char *token;
    char *tag;
    cst_utterance *utt;
    cst_relation *tokrel;
    int num_tokens;
    cst_breakfunc breakfunc = default_utt_break;
    cst_uttfunc utt_user_callback = 0;
    float durs = 0.0;
    cst_item *t;

    ssml_feats = new_features();
    ssml_word_feats = new_features();
    set_charclasses(ts,
                    " \t\n\r",
                    ssml_singlecharsymbols_general,
                    get_param_string(voice->features,"text_prepunctuation",""),
                    get_param_string(voice->features,"text_postpunctuation","")
                    );

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    num_tokens = 0;
    utt = new_utterance();
    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        token = ts_get(ts);
        if (cst_streq("<",token))
        {   /* A tag */
            tag = cst_upcase(ts_get(ts));
            if (cst_streq("/",tag)) /* an end tag */
            {
                /* Release the "/" copy before it is replaced by the */
                /* tag name, otherwise it is leaked. */
                cst_free(tag);
                tag = cst_upcase(ts_get(ts));
                attributes = ssml_get_attributes(ts);
                feat_set_string(attributes,"_type","end");
            }
            else
                attributes = ssml_get_attributes(ts);
            utt = ssml_apply_tag(tag,attributes,utt,ssml_word_feats);
            cst_free(tag);
        }
        else if (cst_streq("&",token))
        {   /* an escape sequence */
            /* skip to ; and insert value in rawdata */
        }
        else
        {
            if ((cst_strlen(token) == 0) ||
                (num_tokens > 500) ||  /* need an upper bound */
                (relation_head(tokrel) &&
                 breakfunc(ts,token,tokrel)))
            {
                /* An end of utt, so synthesize it */
                if (utt_user_callback)
                    utt = (utt_user_callback)(utt);

                if (utt)
                {
                    utt = flite_do_synth(utt,voice,utt_synth_tokens);
                    durs += flite_process_output(utt,outtype,TRUE);
                    delete_utterance(utt); utt = NULL;
                }
                else
                    break;

                if (ts_eof(ts)) break;

                utt = new_utterance();
                tokrel = utt_relation_create(utt, "Token");
                num_tokens = 0;
            }

            num_tokens++;

            t = relation_append(tokrel, NULL);
            item_set_string(t,"name",token);
            item_set_string(t,"whitespace",ts->whitespace);
            item_set_string(t,"prepunctuation",ts->prepunctuation);
            item_set_string(t,"punc",ts->postpunctuation);
            /* Mark it at the beginning of the token */
            item_set_int(t,"file_pos",
                         ts->file_pos-(1+ /* as we are already on the next char */
                                       cst_strlen(token)+
                                       cst_strlen(ts->prepunctuation)+
                                       cst_strlen(ts->postpunctuation)));
            item_set_int(t,"line_number",ts->line_number);
        }
    }

    delete_utterance(utt);
    /* Both feature sets were created here and are not returned, so */
    /* release them once every utterance has been deleted. */
    delete_features(ssml_word_feats);
    delete_features(ssml_feats);
    return durs;
}