/* Flite_HTS_Engine_synthesize: synthesize speech */ HTS_Boolean Flite_HTS_Engine_synthesize(Flite_HTS_Engine * f, const char *txt, const char *wav) { int i; FILE *fp; cst_voice *v = NULL; cst_utterance *u = NULL; cst_item *s = NULL; char **label_data = NULL; int label_size = 0; if (txt == NULL) return FALSE; /* text analysis part */ v = REGISTER_VOX(NULL); if (v == NULL) return FALSE; u = flite_synth_text(txt, v); if (u == NULL) { UNREGISTER_VOX(v); return FALSE; } for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s)) label_size++; if (label_size <= 0) { delete_utterance(u); UNREGISTER_VOX(v); return FALSE; } label_data = (char **) calloc(label_size, sizeof(char *)); for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) { label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char)); Flite_HTS_Engine_create_label(f, s, label_data[i]); } /* speech synthesis part */ HTS_Engine_synthesize_from_strings(&f->engine, label_data, label_size); if (wav != NULL) { fp = fopen(wav, "wb"); HTS_Engine_save_riff(&f->engine, fp); fclose(fp); } HTS_Engine_refresh(&f->engine); for (i = 0; i < label_size; i++) free(label_data[i]); free(label_data); delete_utterance(u); UNREGISTER_VOX(v); return TRUE; }
/* Flite_Text_Analyzer_analysis: text analysis */ void Flite_Text_Analyzer_analysis(Flite_Text_Analyzer * analyzer, const char *text) { int i; cst_item *s; Flite_Utterance *fu; if (analyzer == NULL || text == NULL) return; if (analyzer->pointer != NULL) Flite_Text_Analyzer_clear(analyzer); /* allocate */ fu = (Flite_Utterance *) malloc(sizeof(Flite_Utterance)); /* create voice */ fu->v = REGISTER_VOX(NULL); if (fu->v == NULL) { free(fu); return; } /* create utterance */ fu->u = flite_synth_text(text, fu->v); if (fu->u == NULL) { UNREGISTER_VOX(fu->v); free(fu); return; } /* count number of phonemes */ for (fu->nitem = 0, s = relation_head(utt_relation(fu->u, "Segment")); s; s = item_next(s), fu->nitem++); if (fu->nitem == 0) { delete_utterance(fu->u); UNREGISTER_VOX(fu->v); free(fu); return; } /* save informations */ fu->items = (cst_item **) malloc(sizeof(cst_item *) * fu->nitem); for (i = 0, s = relation_head(utt_relation(fu->u, "Segment")); s; s = item_next(s), i++) fu->items[i] = s; analyzer->pointer = (void *) fu; }
/* Flite_HTS_Engine_synthesis: speech synthesis */ void Flite_HTS_Engine_synthesis(Flite_HTS_Engine * f, char *txt, FILE * wavfp) { int i; cst_voice *v = NULL; cst_utterance *u = NULL; cst_item *s = NULL; char **label_data = NULL; int label_size = 0; /* text analysis part */ v = REGISTER_VOX(NULL); if (v == NULL) return; u = flite_synth_text(txt, v); if (u == NULL) return; for (s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s)) label_size++; if (label_size <= 0) return; label_data = (char **) calloc(label_size, sizeof(char *)); for (i = 0, s = relation_head(utt_relation(u, "Segment")); s; s = item_next(s), i++) { label_data[i] = (char *) calloc(MAXBUFLEN, sizeof(char)); Flite_HTS_Engine_create_label(f, s, label_data[i]); } /* speech synthesis part */ HTS_Engine_load_label_from_string_list(&f->engine, label_data, label_size); HTS_Engine_create_sstream(&f->engine); HTS_Engine_create_pstream(&f->engine); HTS_Engine_create_gstream(&f->engine); if (wavfp != NULL) HTS_Engine_save_riff(&f->engine, wavfp); HTS_Engine_refresh(&f->engine); for (i = 0; i < label_size; i++) free(label_data[i]); free(label_data); delete_utterance(u); UNREGISTER_VOX(v); }
int main(int argc, char **argv) { struct timeval tv; cst_voice *v; const char *filename; const char *outtype; int i; float durs; double time_start, time_end; int flite_verbose, flite_loop, flite_bench; int explicit_filename, explicit_text, explicit_phones; #define ITER_MAX 3 int bench_iter = 0; cst_features *extra_feats; filename = 0; outtype = "play"; /* default is to play */ flite_verbose = FALSE; flite_loop = FALSE; flite_bench = FALSE; explicit_text = explicit_filename = explicit_phones = FALSE; extra_feats = new_features(); flite_init(); for (i=1; i<argc; i++) { if (cst_streq(argv[i],"--version")) { flite_version(); return 1; } else if (cst_streq(argv[i],"-h") || cst_streq(argv[i],"--help") || cst_streq(argv[i],"-?")) flite_usage(); else if (cst_streq(argv[i],"-v")) flite_verbose = TRUE; else if (cst_streq(argv[i],"-l")) flite_loop = TRUE; else if (cst_streq(argv[i],"-b")) { flite_bench = TRUE; break; /* ignore other arguments */ } else if ((cst_streq(argv[i],"-o")) && (i+1 < argc)) { outtype = argv[i+1]; i++; } else if (cst_streq(argv[i],"-f") && (i+1 < argc)) { filename = argv[i+1]; explicit_filename = TRUE; i++; } else if (cst_streq(argv[i],"-pw")) { feat_set_string(extra_feats,"print_info_relation","Word"); feat_set(extra_feats,"post_synth_hook_func", uttfunc_val(&print_info)); } else if (cst_streq(argv[i],"-ps")) { feat_set_string(extra_feats,"print_info_relation","Segment"); feat_set(extra_feats,"post_synth_hook_func", uttfunc_val(&print_info)); } else if (cst_streq(argv[i],"-pr") && (i+1 < argc)) { feat_set_string(extra_feats,"print_info_relation",argv[i+1]); feat_set(extra_feats,"post_synth_hook_func", uttfunc_val(&print_info)); i++; } else if ((cst_streq(argv[i],"-set") || cst_streq(argv[i],"-s")) && (i+1 < argc)) { ef_set(extra_feats,argv[i+1],0); i++; } else if (cst_streq(argv[i],"--seti") && (i+1 < argc)) { ef_set(extra_feats,argv[i+1],"int"); i++; } else if (cst_streq(argv[i],"--setf") && (i+1 < argc)) { ef_set(extra_feats,argv[i+1],"float"); i++; } else if (cst_streq(argv[i],"--sets") && (i+1 < argc)) { ef_set(extra_feats,argv[i+1],"string"); i++; } else if (cst_streq(argv[i],"-p") && (i+1 < argc)) { filename = argv[i+1]; explicit_phones = TRUE; i++; } else if (cst_streq(argv[i],"-t") && (i+1 < argc)) { filename = argv[i+1]; explicit_text = TRUE; i++; } else if (filename) outtype = argv[i]; else filename = argv[i]; } if (filename == NULL) filename = "-"; /* stdin */ v = REGISTER_VOX(NULL); feat_copy_into(extra_feats,v->features); durs = 0.0; if (flite_bench) { outtype = "none"; filename = "A whole joy was reaping, but they've gone south, you should fetch azure mike."; explicit_text = TRUE; } loop: gettimeofday(&tv,NULL); time_start = (double)(tv.tv_sec)+(((double)tv.tv_usec)/1000000.0); if (explicit_phones) durs = flite_phones_to_speech(filename,v,outtype); else if ((strchr(filename,' ') && !explicit_filename) || explicit_text) durs = flite_text_to_speech(filename,v,outtype); else durs = flite_file_to_speech(filename,v,outtype); gettimeofday(&tv,NULL); time_end = ((double)(tv.tv_sec))+((double)tv.tv_usec/1000000.0); if (flite_verbose || (flite_bench && bench_iter == ITER_MAX)) printf("times faster than real-time: %f\n(%f seconds of speech synthesized in %f)\n", durs/(float)(time_end-time_start), durs, (float)(time_end-time_start)); if (flite_loop || (flite_bench && bench_iter++ < ITER_MAX)) goto loop; delete_features(extra_feats); UNREGISTER_VOX(v); return 0; }