/* Synthesize `text` with `voice` and return a copy of the resulting
 * waveform.  The caller owns the returned cst_wave; returns NULL when
 * synthesis fails. */
cst_wave *flite_text_to_wave(const char *text, cst_voice *voice)
{
    cst_utterance *utt = flite_synth_text(text, voice);

    if (utt == NULL)
        return NULL;

    /* Copy before deleting: the utterance owns the original wave. */
    cst_wave *wave = copy_wave(utt_wave(utt));
    delete_utterance(utt);
    return wave;
}
/* Flite_HTS_Engine_synthesize: synthesize speech
 *
 * Runs Flite text analysis on `txt`, converts each item of the Segment
 * relation into an HTS full-context label, synthesizes with the HTS
 * engine, and optionally writes a RIFF wave to the file named by `wav`
 * (skipped when `wav` is NULL).  Returns TRUE on success, FALSE on any
 * failure (no text, no voice, empty analysis, allocation or file error).
 */
HTS_Boolean Flite_HTS_Engine_synthesize(Flite_HTS_Engine * f, const char *txt, const char *wav)
{
   int i;
   FILE *fp;
   cst_voice *v = NULL;
   cst_utterance *u = NULL;
   cst_item *s = NULL;
   char **label_data = NULL;
   int label_size = 0;
   HTS_Boolean result = TRUE;

   if (txt == NULL)
      return FALSE;

   /* text analysis part */
   v = REGISTER_VOX(NULL);
   if (v == NULL)
      return FALSE;
   u = flite_synth_text(txt, v);
   if (u == NULL) {
      UNREGISTER_VOX(v);
      return FALSE;
   }
   for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s))
      label_size++;
   if (label_size <= 0) {
      delete_utterance(u);
      UNREGISTER_VOX(v);
      return FALSE;
   }
   label_data = (char **) calloc(label_size, sizeof(char *));
   if (label_data == NULL) {    /* fix: allocation result was unchecked */
      delete_utterance(u);
      UNREGISTER_VOX(v);
      return FALSE;
   }
   for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) {
      label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char));
      if (label_data[i] == NULL) {      /* fix: allocation result was unchecked */
         while (--i >= 0)
            free(label_data[i]);
         free(label_data);
         delete_utterance(u);
         UNREGISTER_VOX(v);
         return FALSE;
      }
      Flite_HTS_Engine_create_label(f, s, label_data[i]);
   }

   /* speech synthesis part */
   HTS_Engine_synthesize_from_strings(&f->engine, label_data, label_size);
   if (wav != NULL) {
      fp = fopen(wav, "wb");
      if (fp != NULL) {         /* fix: fopen result was unchecked (NULL deref) */
         HTS_Engine_save_riff(&f->engine, fp);
         fclose(fp);
      } else {
         result = FALSE;        /* synthesis ran, but output file could not be written */
      }
   }
   HTS_Engine_refresh(&f->engine);

   for (i = 0; i < label_size; i++)
      free(label_data[i]);
   free(label_data);
   delete_utterance(u);
   UNREGISTER_VOX(v);

   return result;
}
/* Synthesize `text` with `voice`, dispatch the result according to
 * `outtype` via flite_process_output(), and return the duration of the
 * synthesized speech in seconds (0 when synthesis fails).
 *
 * Fix: guard against flite_synth_text() returning NULL — the other
 * callers in this codebase check it; previously a failed synthesis was
 * passed straight to flite_process_output()/delete_utterance().
 */
float flite_text_to_speech(const char *text, cst_voice *voice, const char *outtype)
{
    cst_utterance *u;
    float dur;

    u = flite_synth_text(text, voice);
    if (u == NULL)
        return 0.0f;    /* synthesis failed: no audio produced */
    dur = flite_process_output(u, outtype, FALSE);
    delete_utterance(u);
    return dur;
}
/* Flite_Text_Analyzer_analysis: text analysis
 *
 * Runs Flite synthesis on `text`, then caches the voice, the utterance,
 * and a pointer to every item of the Segment relation in a freshly
 * allocated Flite_Utterance stored in analyzer->pointer.  Any previous
 * analysis is cleared first.  On any failure every partially acquired
 * resource is released and analyzer->pointer is left untouched (NULL).
 */
void Flite_Text_Analyzer_analysis(Flite_Text_Analyzer * analyzer, const char *text)
{
   int i;
   cst_item *s;
   Flite_Utterance *fu;

   if (analyzer == NULL || text == NULL)
      return;
   if (analyzer->pointer != NULL)
      Flite_Text_Analyzer_clear(analyzer);

   /* allocate per-analysis state */
   fu = (Flite_Utterance *) malloc(sizeof(Flite_Utterance));
   if (fu == NULL)              /* fix: allocation result was unchecked */
      return;

   /* create voice */
   fu->v = REGISTER_VOX(NULL);
   if (fu->v == NULL) {
      free(fu);
      return;
   }

   /* create utterance */
   fu->u = flite_synth_text(text, fu->v);
   if (fu->u == NULL) {
      UNREGISTER_VOX(fu->v);
      free(fu);
      return;
   }

   /* count number of phonemes */
   for (fu->nitem = 0, s = relation_head(utt_relation(fu->u, "Segment")); s; s = item_next(s), fu->nitem++);
   if (fu->nitem == 0) {
      delete_utterance(fu->u);
      UNREGISTER_VOX(fu->v);
      free(fu);
      return;
   }

   /* save segment pointers for later per-phoneme queries */
   fu->items = (cst_item **) malloc(sizeof(cst_item *) * fu->nitem);
   if (fu->items == NULL) {     /* fix: allocation result was unchecked */
      delete_utterance(fu->u);
      UNREGISTER_VOX(fu->v);
      free(fu);
      return;
   }
   for (i = 0, s = relation_head(utt_relation(fu->u, "Segment")); s; s = item_next(s), i++)
      fu->items[i] = s;

   analyzer->pointer = (void *) fu;
}
/* Flite_HTS_Engine_synthesis: speech synthesis
 *
 * Runs Flite text analysis on `txt`, builds HTS full-context labels from
 * the Segment relation, runs the HTS sstream/pstream/gstream pipeline,
 * and writes a RIFF wave to `wavfp` when it is non-NULL.
 */
void Flite_HTS_Engine_synthesis(Flite_HTS_Engine * f, char *txt, FILE * wavfp)
{
   int i;
   cst_voice *v = NULL;
   cst_utterance *u = NULL;
   cst_item *s = NULL;
   char **label_data = NULL;
   int label_size = 0;

   /* text analysis part */
   v = REGISTER_VOX(NULL);
   if (v == NULL)
      return;
   u = flite_synth_text(txt, v);
   if (u == NULL) {
      UNREGISTER_VOX(v);        /* fix: voice was leaked on this path */
      return;
   }
   for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s))
      label_size++;
   if (label_size <= 0) {
      delete_utterance(u);      /* fix: utterance and voice were leaked here */
      UNREGISTER_VOX(v);
      return;
   }
   label_data = (char **) calloc(label_size, sizeof(char *));
   if (label_data == NULL) {    /* fix: allocation result was unchecked */
      delete_utterance(u);
      UNREGISTER_VOX(v);
      return;
   }
   for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) {
      label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char));
      if (label_data[i] == NULL) {      /* fix: allocation result was unchecked */
         while (--i >= 0)
            free(label_data[i]);
         free(label_data);
         delete_utterance(u);
         UNREGISTER_VOX(v);
         return;
      }
      Flite_HTS_Engine_create_label(f, s, label_data[i]);
   }

   /* speech synthesis part */
   HTS_Engine_load_label_from_string_list(&f->engine, label_data, label_size);
   HTS_Engine_create_sstream(&f->engine);
   HTS_Engine_create_pstream(&f->engine);
   HTS_Engine_create_gstream(&f->engine);
   if (wavfp != NULL)
      HTS_Engine_save_riff(&f->engine, wavfp);
   HTS_Engine_refresh(&f->engine);

   for (i = 0; i < label_size; i++)
      free(label_data[i]);
   free(label_data);
   delete_utterance(u);
   UNREGISTER_VOX(v);
}
/* Return the phoneme sequence of sText as a space-separated string.
 * A stressed vowel gets "1" appended to its phoneme name.  Returns an
 * empty string when synthesis fails.
 *
 * Fixes: guard against flite_synth_text() returning NULL (previously
 * dereferenced unconditionally); removed unused locals (`name`, the
 * discarded `item_feat_float` result) and dead commented-out code.
 */
string getPhonemes( const char* sText )
{
    string sRet;
    cst_features* args = new_features();
    cst_voice* v;
    cst_utterance* u;
    cst_item* s;

    flite_init();
    v = register_cmu_us_no_wave( NULL );
    u = flite_synth_text( sText, v );
    if ( u == NULL )
    {
        delete_features( args );
        return sRet;
    }

    for ( s = relation_head( utt_relation( u, "Segment" ) ); s; s = item_next( s ) )
    {
        sRet += item_feat_string( s, "name" );

        /* If it's a vowel and is stressed, output stress value */
        if ( cst_streq( "+", ffeature_string( s, "ph_vc" ) ) &&
             cst_streq( "1", ffeature_string( s, "R:SylStructure.parent.stress" ) ) )
        {
            sRet += "1";
        }
        sRet += " ";
    }

    delete_utterance( u );
    delete_features( args );
    return sRet;
}
/* Synthesize `text` with `voice` and route the waveform according to
 * `outtype`: "play" plays it, "none" discards it, and any other value is
 * treated as a RIFF file name to save to.  Returns the duration of the
 * waveform in seconds, or -1 when synthesis fails. */
float flite_text_to_speech(const char *text, cst_voice *voice, const char *outtype)
{
    cst_utterance *utt = flite_synth_text(text, voice);

    if (utt == NULL)
        return -1;

    cst_wave *wave = utt_wave(utt);
    float seconds = (float) wave->num_samples / (float) wave->sample_rate;

    if (cst_streq(outtype, "play")) {
        play_wave(wave);
    } else if (!cst_streq(outtype, "none")) {
        cst_wave_save_riff(wave, outtype);
    }

    delete_utterance(utt);
    return seconds;
}
/* Palm OS main event loop: dispatches system/menu/form events and,
 * after each event, services asynchronous speech playback through the
 * global `flite` state and the `playdata` stream bookkeeping.  Loops
 * until FlopStop is set or an appStopEvent arrives. */
static void AppEventLoop(void)
{
    Err error;
    EventType event;

    do {
        /* NOTE(review): an earlier comment said "wait for 100ms only so
         * we can serve the audio", but evtWaitForever blocks until an
         * event arrives, so the playback-serving code below only runs on
         * event activity — confirm whether a timeout tick was intended. */
        EvtGetEvent(&event, evtWaitForever);
        if (SysHandleEvent(&event))
            continue;
        if (MenuHandleEvent(0, &event, &error))
            continue;
        if (AppHandleEvent(&event))
            continue;
        FrmDispatchEvent(&event);

        /* Serve playing */
        if (FlopPlay == true) {
            if (!flite->samples) {
                /* Nothing buffered yet: synthesize the next chunk. */
                flite->type = FlopOutputType;   /* words, phones, wave, stream */
                flite->text = input;            /* text to be synthesized */
                flite_synth_text(flite);        /* do the synthesis */
                flite->WavePosition = flite->start;
                flite->start += flite->utt_length;  /* advance position in the text */
                /* highlight the area being spoken */
                if (flite->type != FliteOutputTypeWave)
                    FlopPlay = false;   /* no async play for words/phones output */
            }
            if (playdata.samples && !playdata.active)
                /* stop and tidy up anything that's finished playing */
                StopPlayStream();
            else if (flite->samples && !playdata.active) {
                /* create a new stream and start it playing */
                flite->PlayPosition = flite->WavePosition;
                SetupPlayStream(flite);
                SndStreamStart(playstream);
            } else if (!playdata.active)
                /* nothing buffered and nothing active: leave play mode */
                FlopPlay = false;
        } else {
            /* Stopped: report position, halt the stream, drop pending audio. */
            if (flite && flite->output)
                StrPrintF(flite->output,"stopped %d",flite->PlayPosition);
            StopPlayStream();
            /* flush any other waveform waiting to play */
            if (flite->samples) {
                MemPtrFree(flite->samples);
                flite->num_samples = 0;
                flite->samples = 0;
            }
        }
        if (flite && flite->output) {
            /* should be within the if above, but kept here for debugging:
             * refresh the output field with any new status text */
            SetField(FlopForm, FlopOutput, flite->output);
            FrmDrawForm(FrmGetFormPtr(FlopForm));
        }
    } while ((FlopStop==false) && (event.eType != appStopEvent));
    return;
}
float flite_text_to_speech_phenome( const char* text, cst_voice* voice, const char* outtype, void* pStream ) { cst_utterance* u; float dur; float end_last = 0; float end_current = 0; float dur_current = 0; float dur_sum = 0; //feat_set_float( voice->features, "duration_stretch", 1 ); u = flite_synth_text( text, voice ); cst_item* s; string sRet; int nPhoneme = 0; for ( s = relation_head( utt_relation( u, "Segment" ) ); s; s = item_next( s ) ) { SPhenomeTiming ps; string sPhoneme = item_feat_string( s, "name" ); sRet += sPhoneme; end_current = item_feat_float( s, "end" ); dur_current = end_current - end_last; //if ( !( nPhoneme == 0 && sPhoneme == "pau" ) ) //{ dur_sum += dur_current; //} ps.fWeight = 1; /* If its a vowel and is stressed output stress value */ if ( ( cst_streq( "+", ffeature_string( s, "ph_vc" ) ) ) && ( cst_streq( "1", ffeature_string( s, "R:SylStructure.parent.stress" ) ) ) ) { sRet += "1"; ps.fWeight = 1.3; } sRet += " "; if ( pStream ) { // fade into each other ps.sName = sPhoneme; ps.fStart = end_current - dur_current; ps.fEnd = end_current; ps.fDuration = dur_current; ( ( CryMT::queue<SPhenomeTiming>* )pStream )->push( ps ); } end_last = end_current; ++nPhoneme; } dur = flite_process_output( u, outtype, FALSE ); delete_utterance( u ); return dur; }
int main(int argc, char *argv[]) { char *s,*fn; cst_voice *voice; // synthesis voice cst_utterance *utt; // current utterance cst_wave *cstwave; // synthesised wave Wave w; // HTK wave short *p; HTime sampPeriod = 625.0; int n; MemHeap mem; AudioOut a; try { if (InitHTK(argc,argv,version)<SUCCESS){ ReportErrors("Main",0); exit(-1); } if (NumArgs() !=2) { printf("SFliteTest synthstring file\n"); exit(0); } CreateHeap(&mem,"heap",MSTAK,1,0.0,10000,100000); s = GetStrArg(); fn = GetStrArg(); printf("Synth: %s -> %s\n",s,fn); // initialise Edinburgh cst lib cst_regex_init(); // setup the voice voice = register_cmu_us_kal16(NULL); // convert text to waveform utt = flite_synth_text(s,voice); if (utt==NULL) { HRError(12001,"SFliteTest: cant synthesise %s\n",s); throw ATK_Error(12001); } cstwave = utt_wave(utt); p = cstwave->samples; n = cstwave->num_samples; w = OpenWaveOutput(&mem,&sampPeriod,n); printf("%d samples created\n",n); PutWaveSample(w,n,p); if (CloseWaveOutput(w,WAV,fn)<SUCCESS){ ReportErrors("Main",0); exit(-1); } // explore structure const cst_item *it, *itlast = NULL; float x,y; int i; string lastword="0"; x = 0; for (i=1,it = relation_head(utt_relation(utt, "Segment")); it!=NULL; it = item_next(it),i++) { printf("Segment %d\n",i); y = item_feat_float(it,"end"); string ph = string(ffeature_string(it,"p.name")); string wd = string(ffeature_string(it,"R:SylStructure.parent.parent.name")); //printf("end = %f ph=%s wd=%s\n",y,ph.c_str(),wd.c_str()); if (wd != lastword){ printf("**** end of %s = %f\n",lastword.c_str(),x); lastword=wd; } x = y; } //if (itlast!=NULL) { // word = string(ffeature_string(itlast,"R:SylStructure.parent.parent.name")); // idx = text.find(word); //} return 0; } catch (ATK_Error e){ ReportErrors("ATK",e.i); } catch (HTK_Error e){ ReportErrors("HTK",e.i); } return 0; }