/* Tokenize the utterance's input text into a freshly created "Token"
   relation.  Each non-empty token read from the tokenstream becomes one
   item holding the token text ("name") plus the whitespace,
   prepunctuation, postpunctuation ("punc"), file position and line
   number fields the tokenstream carries after that read.  The character
   classes are taken from the utterance's "text_*" features, with NULL
   passed as the fallback value.  Returns u. */
cst_utterance *default_tokenization(cst_utterance *u)
{
    const char *input;
    const char *tok;
    cst_relation *tok_rel;
    cst_tokenstream *stream;
    cst_item *node;

    input = utt_input_text(u);
    tok_rel = utt_relation_create(u,"Token");
    stream = ts_open_string(input,
                            get_param_string(u->features,"text_whitespace",NULL),
                            get_param_string(u->features,"text_singlecharsymbols",NULL),
                            get_param_string(u->features,"text_prepunctuation",NULL),
                            get_param_string(u->features,"text_postpunctuation",NULL));

    while (!ts_eof(stream))
    {
        tok = ts_get(stream);
        if (cst_strlen(tok) == 0)
            continue;           /* empty tokens are not recorded */

        node = relation_append(tok_rel,NULL);
        item_set_string(node,"name",tok);
        item_set_string(node,"whitespace",stream->whitespace);
        item_set_string(node,"prepunctuation",stream->prepunctuation);
        item_set_string(node,"punc",stream->postpunctuation);
        item_set_int(node,"file_pos",stream->file_pos);
        item_set_int(node,"line_number",stream->line_number);
    }

    ts_close(stream);
    return u;
}
/* Read tokens from the named file and append one item per non-empty token
   to relation r.  Each item gets the token text ("name") and the
   tokenstream's whitespace, prepunctuation, postpunctuation ("punc"),
   file position and line number.
   Returns 0 if the file could not be opened, 1 otherwise. */
static int bbb_relation_load(cst_relation *r,const char *filename)
{
    cst_tokenstream *stream = ts_open(filename);

    if (!stream)
        return 0;

    while (!ts_eof(stream))
    {
        const char *tok = ts_get(stream);
        cst_item *node;

        if (!cst_streq(tok,""))
        {
            node = relation_append(r,NULL);
            item_set_string(node,"name",tok);
            item_set_string(node,"whitespace",stream->whitespace);
            item_set_string(node,"prepunctuation",stream->prepunctuation);
            item_set_string(node,"punc",stream->postpunctuation);
            item_set_int(node,"file_pos",stream->file_pos);
            item_set_int(node,"line_number",stream->line_number);
        }
    }

    ts_close(stream);
    return 1;
}
/* Build the HMM state layer underneath the Segment relation.
   Creates the "HMMstate" and "segstate" relations.  Every Segment item is
   appended to "segstate", and for each state name listed for its phone in
   cg_db->phone_states a new "HMMstate" item is created and attached as a
   daughter of that segstate item.  Returns utt. */
static cst_utterance *cg_make_hmmstates(cst_utterance *utt)
{
    cst_cg_db *db = val_cg_db(utt_feat_val(utt,"cg_db"));
    cst_relation *state_rel = utt_relation_create(utt,"HMMstate");
    cst_relation *segstate_rel = utt_relation_create(utt,"segstate");
    cst_item *segment;

    for (segment = utt_rel_head(utt,"Segment");
         segment;
         segment = item_next(segment))
    {
        cst_item *seg_node = relation_append(segstate_rel,segment);
        const char *phone = item_feat_string(segment,"name");
        cst_item *state;
        int row = 0;
        int pos;

        /* Find the phone_states row whose first entry is this phone name */
        while (db->phone_states[row] &&
               !cst_streq(phone,db->phone_states[row][0]))
            row++;
        if (db->phone_states[row] == NULL)
            row = 0;            /* unknown phoneme: use the first row */

        /* Entries from index 1 up to the NULL terminator are state names */
        for (pos = 1; db->phone_states[row][pos]; pos++)
        {
            state = relation_append(state_rel,NULL);
            item_add_daughter(seg_node,state);
            item_set_string(state,"name",db->phone_states[row][pos]);
            item_set_int(state,"statepos",pos);
        }
    }

    return utt;
}
static cst_val* word_to_phones(const cst_item *word) { cst_val*phones=NULL; const char *name=item_feat_string(word, "name"); ustring32_t letters=ustring32_alloc(0); if(letters==NULL) return NULL; ustring32_assign8(letters,(const uint8_t*)name); if(ustring32_empty(letters)) { ustring32_free(letters); return NULL; } unsigned int flags=classify_characters(ustring32_str(letters),ustring32_length(letters)); int variant=item_feat_int(item_parent(item_as(word,"Token")),"variant"); if((flags&cs_lc)&&cst_streq(ffeature_string(word,"gpos"),"content")) { if(variant==variant_pseudo_english) phones=ustring32_lts_apply(letters,&en_consonants_lts); else phones=ustring32_lts_apply(letters,&ru_consonants_lts); item_set_int(word,"no_vr",1); } else if((variant==variant_pseudo_english)&&(flags&cs_en)) { cst_val *en_phones=lex_lookup(en_lex,name,(cst_streq(name,"a")?"n":NULL)); if(en_phones) { phones=ru_lts_apply(en_phones,&ru_en_lts); delete_val(en_phones); } item_set_int(word,"no_pl",1); } else { const ru_dict_entry *e=bsearch(name,ru_dict,ru_dict_size,sizeof(ru_dict_entry),compare_entries); if(e!=NULL) { if(e->stress > 0) ustring32_set(letters,e->stress-1,1105); else item_set_int(word,"stressed_syl_num",e->stress); } phones=ustring32_lts_apply(letters,&ru_lts); } ustring32_free(letters); return phones; }
/* Create the frame items for the utterance.
   Builds the "mcep" relation (one item per frame; the name is historical,
   the vectors may be of any kind) and the "mcep_link" relation that hangs
   those frames under their HMM state.  Each state's "end" is the running
   total of the state durations scaled by the "duration_stretch" parameter
   and the token-level local_duration_stretch (a value of 0 is treated as
   1.0).  Afterwards the end of each Segment is copied up from its last
   state, and the frame count is stored in "param_track_num_frames".
   Returns utt. */
static cst_utterance *cg_make_params(cst_utterance *utt)
{
    cst_cg_db *db = val_cg_db(utt_feat_val(utt,"cg_db"));
    cst_relation *frame_rel = utt_relation_create(utt,"mcep");
    cst_relation *link_rel = utt_relation_create(utt,"mcep_link");
    cst_item *state, *seg, *link_node, *frame;
    float end = 0.0;
    int num_frames = 0;
    float dur_stretch = get_param_float(utt->features,"duration_stretch", 1.0);
    float tok_stretch;

    state = utt_rel_head(utt,"HMMstate");
    while (state)
    {
        tok_stretch = ffeature_float(state,"R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_duration_stretch");
        if (tok_stretch == 0)
            tok_stretch = 1.0;

        /* the previous state's end is this state's start */
        end = end + (tok_stretch*dur_stretch*cg_state_duration(state,db));
        item_set_float(state,"end",end);

        link_node = relation_append(link_rel, state);
        /* emit frames while their start time does not pass this state's end */
        while ((num_frames * db->frame_advance) <= end)
        {
            frame = relation_append(frame_rel,NULL);
            item_add_daughter(link_node,frame);
            item_set_int(frame,"frame_number",num_frames);
            item_set(frame,"name",item_feat(link_node,"name"));
            num_frames++;
        }

        state = item_next(state);
    }

    /* Copy duration up onto Segment relation */
    for (seg = utt_rel_head(utt,"Segment"); seg; seg = item_next(seg))
        item_set(seg,"end",ffeature(seg,"R:segstate.daughtern.end"));

    utt_set_feat_int(utt,"param_track_num_frames",num_frames);

    return utt;
}
/* Copy the pitchmark structure of the selected units unchanged.
   A first pass over the "Unit" relation totals the number of frames
   (unit_end - unit_start per unit) and records the cumulative unit size
   on each unit as "target_end".  A second pass fills target_lpcres->times
   with the running sum of the frame sizes.  The result is stored on the
   utterance as "target_lpcres".  Returns utt. */
cst_utterance *asis_to_pm(cst_utterance *utt)
{
    cst_sts_list *frames = val_sts_list(utt_feat_val(utt,"sts_list"));
    cst_lpcres *lpcres = new_lpcres();
    cst_item *unit;
    int first, last, frame;
    int pm_count = 0;
    int sample_total = 0;

    /* Pass one: measure */
    unit = relation_head(utt_relation(utt,"Unit"));
    while (unit)
    {
        first = item_feat_int(unit,"unit_start");
        last = item_feat_int(unit,"unit_end");
        sample_total += get_unit_size(frames,first,last);
        pm_count += last - first;
        item_set_int(unit,"target_end",sample_total);
        unit = item_next(unit);
    }
    lpcres_resize_frames(lpcres,pm_count);

    /* Pass two: fill in the cumulative times */
    pm_count = 0;
    sample_total = 0;
    unit = relation_head(utt_relation(utt,"Unit"));
    while (unit)
    {
        first = item_feat_int(unit,"unit_start");
        last = item_feat_int(unit,"unit_end");
        for (frame = first; frame < last; frame++)
        {
            sample_total += get_frame_size(frames, frame);
            lpcres->times[pm_count] = sample_total;
            pm_count++;
        }
        unit = item_next(unit);
    }

    utt_set_feat(utt,"target_lpcres",lpcres_val(lpcres));
    return utt;
}
/* Synthesize the contents of a text file, utterance by utterance.
   Tokens are read from the file and collected in a "Token" relation; when
   an empty token is read, more than 500 tokens are pending, or the break
   function reports an utterance boundary, the collected utterance is
   handed to flite_tokens_to_speech().  Unless outtype is "play" or "none",
   an empty RIFF wave is first saved to outtype so output can be appended
   to it incrementally.
   Returns the sum of the durations reported for each utterance, or 1 if
   the file could not be opened. */
float flite_file_to_speech(const char *filename,
                           cst_voice *voice,
                           const char *outtype)
{
    cst_tokenstream *stream;
    cst_utterance *cur_utt;
    cst_relation *tok_rel;
    cst_item *node;
    const char *tok;
    cst_breakfunc is_break = default_utt_break;
    float total = 0;
    float spoken;
    int pending;
    int flush;

    stream = ts_open(filename,
                     get_param_string(voice->features,"text_whitespace",NULL),
                     get_param_string(voice->features,"text_singlecharsymbols",NULL),
                     get_param_string(voice->features,"text_prepunctuation",NULL),
                     get_param_string(voice->features,"text_postpunctuation",NULL));
    if (stream == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for reading\n", filename);
        return 1;
    }

    if (feat_present(voice->features,"utt_break"))
        is_break = val_breakfunc(feat_val(voice->features,"utt_break"));

    /* When writing to a file, start it off as an empty wave since each
       utterance is appended to it as it is synthesized */
    if (!(cst_streq(outtype,"play") || cst_streq(outtype,"none")))
    {
        cst_wave *empty = new_wave();
        cst_wave_resize(empty,0,1);
        cst_wave_set_sample_rate(empty,16000);
        cst_wave_save_riff(empty,outtype);
        delete_wave(empty);
    }

    pending = 0;
    cur_utt = new_utterance();
    tok_rel = utt_relation_create(cur_utt, "Token");

    while (!ts_eof(stream) || pending > 0)
    {
        tok = ts_get(stream);

        flush = (strlen(tok) == 0) ||
                (pending > 500) ||      /* need an upper bound */
                (relation_head(tok_rel) && is_break(stream,tok,tok_rel));
        if (flush)
        {
            /* End of an utterance.  The pointer is cleared right after the
               call so it is not passed to delete_utterance() below --
               presumably the callee disposes of it; confirm. */
            spoken = flite_tokens_to_speech(cur_utt,voice,outtype);
            cur_utt = NULL;
            if (spoken < 0)
                break;
            total += spoken;

            if (ts_eof(stream))
                break;

            cur_utt = new_utterance();
            tok_rel = utt_relation_create(cur_utt, "Token");
            pending = 0;
        }

        pending++;
        node = relation_append(tok_rel, NULL);
        item_set_string(node,"name",tok);
        item_set_string(node,"whitespace",stream->whitespace);
        item_set_string(node,"prepunctuation",stream->prepunctuation);
        item_set_string(node,"punc",stream->postpunctuation);
        item_set_int(node,"file_pos",stream->file_pos);
        item_set_int(node,"line_number",stream->line_number);
    }

    delete_utterance(cur_utt);
    ts_close(stream);
    return total;
}
/* Predict the parameter vector for every frame in the "mcep" relation.
   For each frame the tree set is chosen by matching the frame's "name"
   against cg_db->types (falling back to the first set when nothing
   matches).  Channel 0 of param_track is the F0 tree prediction averaged
   with channel 0 of the selected model vector(s); channels 2 and above
   are copied (single model) or averaged (multimodel) from the model
   vectors.  Channel 1 is not written by this function.  With mixed
   excitation, five strength values per frame go into a separate
   str_track.  Each frame item also receives "clustergen_param_frame" and
   "voicing".  After F0 smoothing the tracks are stored on the utterance
   as "param_track" and, with mixed excitation, "str_track".
   Returns utt. */
static cst_utterance *cg_predict_params(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_item *mcep;
    const cst_cart *mcep_tree, *f0_tree;
    int i,j,f,p,fd,o;
    const char *mname;
    float f0_val;
    int fff;
    int extra_feats = 0;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = new_track();
    if (cg_db->do_mlpg) /* which should be the default */
        fff = 1;  /* copy details with stddevs */
    else
        fff = 2;  /* copy details without stddevs */

    extra_feats = 1;  /* voicing */
    if (cg_db->mixed_excitation)
    {
        extra_feats += 5;
        str_track = new_track();
        cst_track_resize(str_track,
                         utt_feat_int(utt,"param_track_num_frames"),
                         5);
    }

    cst_track_resize(param_track,
                     utt_feat_int(utt,"param_track_num_frames"),
                     (cg_db->num_channels0/fff)-
                       (2 * extra_feats));/* no voicing or str */

    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        mname = item_feat_string(mcep,"name");
        for (p=0; cg_db->types[p]; p++)
            if (cst_streq(mname,cg_db->types[p]))
                break;
        /* The search stops on the NULL terminator when no type matched, so
           the entry at p (not at 0) is what tells us the search failed.
           Without this, p would index past the last tree below. */
        if (cg_db->types[p] == NULL)
            p=0; /* if there isn't a matching tree, use the first one */

        /* Predict F0 */
        f0_tree = cg_db->f0_trees[p];
        f0_val = val_float(cart_interpret(mcep,f0_tree));
        param_track->frames[i][0] = f0_val;
        /* what about stddev ? */

        if (cg_db->multimodel)
        {   /* MULTI model */
            f = val_int(cart_interpret(mcep,cg_db->param_trees0[p]));
            fd = val_int(cart_interpret(mcep,cg_db->param_trees1[p]));
            item_set_int(mcep,"clustergen_param_frame",f);

            /* F0 is the mean of the tree value and both models' channel 0 */
            param_track->frames[i][0] =
                (param_track->frames[i][0]+
                 CG_MODEL_VECTOR(cg_db,model_vectors0,f,0)+
                 CG_MODEL_VECTOR(cg_db,model_vectors1,fd,0))/3.0;
            for (j=2; j<param_track->num_channels; j++)
                param_track->frames[i][j] =
                    (CG_MODEL_VECTOR(cg_db,model_vectors0,f,(j)*fff)+
                     CG_MODEL_VECTOR(cg_db,model_vectors1,fd,(j)*fff))/2.0;
            if (cg_db->mixed_excitation)
            {
                /* strengths follow the last copied channel, two slots apart */
                o = j;
                for (j=0; j<5; j++)
                {
                    str_track->frames[i][j] =
                        (CG_MODEL_VECTOR(cg_db,model_vectors0,f,(o+(2*j))*fff)+
                         CG_MODEL_VECTOR(cg_db,model_vectors1,fd,(o+(2*j))*fff))/2.0;
                }
            }
        }
        else
        {   /* SINGLE model */
            /* Predict Spectral */
            mcep_tree = cg_db->param_trees0[p];
            f = val_int(cart_interpret(mcep,mcep_tree));
            item_set_int(mcep,"clustergen_param_frame",f);

            /* F0 is the mean of the tree value and the model's channel 0 */
            param_track->frames[i][0] =
                (param_track->frames[i][0]+
                 CG_MODEL_VECTOR(cg_db,model_vectors0,f,0))/2.0;

            for (j=2; j<param_track->num_channels; j++)
                param_track->frames[i][j] =
                    CG_MODEL_VECTOR(cg_db,model_vectors0,f,(j)*fff);

            if (cg_db->mixed_excitation)
            {
                /* strengths follow the last copied channel, two slots apart */
                o = j;
                for (j=0; j<5; j++)
                {
                    str_track->frames[i][j] =
                        CG_MODEL_VECTOR(cg_db,model_vectors0,f,(o+(2*j))*fff);
                }
            }
        }

        /* last coefficient is average voicing for cluster */
        item_set_float(mcep,"voicing",
                       CG_MODEL_VECTOR(cg_db,model_vectors0,f,
                                       cg_db->num_channels0-2));

        param_track->times[i] = i * cg_db->frame_advance;
    }

    cg_smooth_F0(utt,cg_db,param_track);

    utt_set_feat(utt,"param_track",track_val(param_track));
    if (cg_db->mixed_excitation)
        utt_set_feat(utt,"str_track",track_val(str_track));

    return utt;
}
/* Synthesize SSML-marked text read from an already opened tokenstream.
   The stream's character classes are reset so that the SSML single
   character symbols are tokenized separately.  A "<" token starts a tag:
   its name is upcased, its attributes are read, and ssml_apply_tag() is
   invoked (end tags get the attribute "_type" = "end").  A "&" token is
   currently ignored.  Any other token is added to the current utterance's
   "Token" relation; the utterance is synthesized and output when an empty
   token is read, more than 500 tokens are pending, or the break function
   reports a boundary.  An "utt_user_callback" feature on the voice, when
   present, is applied to each utterance before synthesis; a NULL result
   from it stops processing.
   Returns the accumulated value of flite_process_output() over all
   utterances.  The tokenstream is not closed here. */
static float flite_ssml_to_speech_ts(cst_tokenstream *ts,
                                     cst_voice *voice,
                                     const char *outtype)
{
    cst_features *ssml_feats, *ssml_word_feats;
    cst_features *attributes;
    const char *token;
    char *tag;
    cst_utterance *utt;
    cst_relation *tokrel;
    int num_tokens;
    cst_breakfunc breakfunc = default_utt_break;
    cst_uttfunc utt_user_callback = 0;
    float durs = 0.0;
    cst_item *t;

    /* NOTE(review): neither feature set is released in this function, and
       ssml_feats is not used after creation -- looks like a leak; confirm
       ownership and free them with the features destructor if so. */
    ssml_feats = new_features();
    ssml_word_feats = new_features();
    set_charclasses(ts,
                    " \t\n\r",
                    ssml_singlecharsymbols_general,
                    get_param_string(voice->features,"text_prepunctuation",""),
                    get_param_string(voice->features,"text_postpunctuation","")
                    );

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    num_tokens = 0;
    utt = new_utterance();
    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        token = ts_get(ts);
        if (cst_streq("<",token))
        {   /* A tag */
            tag = cst_upcase(ts_get(ts));
            if (cst_streq("/",tag)) /* an end tag */
            {
                /* Release the upcased "/" before fetching the real tag
                   name; overwriting the pointer would leak one allocation
                   for every closing tag in the input. */
                cst_free(tag);
                tag = cst_upcase(ts_get(ts));
                attributes = ssml_get_attributes(ts);
                feat_set_string(attributes,"_type","end");
            }
            else
                attributes = ssml_get_attributes(ts);
            utt = ssml_apply_tag(tag,attributes,utt,ssml_word_feats);
            cst_free(tag);
        }
        else if (cst_streq("&",token))
        {   /* an escape sequence */
            /* skip to ; and insert value in rawdata */
        }
        else
        {
            if ((cst_strlen(token) == 0) ||
                (num_tokens > 500) ||  /* need an upper bound */
                (relation_head(tokrel) &&
                 breakfunc(ts,token,tokrel)))
            {
                /* An end of utt, so synthesize it */
                if (utt_user_callback)
                    utt = (utt_user_callback)(utt);

                if (utt)
                {
                    utt = flite_do_synth(utt,voice,utt_synth_tokens);
                    durs += flite_process_output(utt,outtype,TRUE);
                    delete_utterance(utt);
                    utt = NULL;
                }
                else
                    break;

                if (ts_eof(ts))
                    break;

                utt = new_utterance();
                tokrel = utt_relation_create(utt, "Token");
                num_tokens = 0;
            }

            num_tokens++;

            t = relation_append(tokrel, NULL);
            item_set_string(t,"name",token);
            item_set_string(t,"whitespace",ts->whitespace);
            item_set_string(t,"prepunctuation",ts->prepunctuation);
            item_set_string(t,"punc",ts->postpunctuation);
            /* Mark it at the beginning of the token */
            item_set_int(t,"file_pos",
                         ts->file_pos-(1+ /* as we are already on the next char */
                                       cst_strlen(token)+
                                       cst_strlen(ts->prepunctuation)+
                                       cst_strlen(ts->postpunctuation)));
            item_set_int(t,"line_number",ts->line_number);
        }
    }

    delete_utterance(utt);
    return durs;
}
/* Synthesize the contents of a text file, utterance by utterance.
   Reading starts at the voice's "file_start_position" when that is
   greater than zero.  Tokens are collected in a "Token" relation; when an
   empty token is read, more than 500 tokens are pending, or the break
   function reports an utterance boundary, the utterance is passed through
   the optional "utt_user_callback", synthesized and output.  A NULL
   result from the callback stops processing.  Unless outtype is "play",
   "none" or "stream", an empty RIFF wave is first saved to outtype so
   output can be appended to it incrementally.
   Returns the accumulated value of flite_process_output() over all
   utterances, or 1 if the file could not be opened. */
float flite_file_to_speech(const char *filename,
                           cst_voice *voice,
                           const char *outtype)
{
    cst_tokenstream *stream;
    cst_utterance *cur_utt;
    cst_relation *tok_rel;
    cst_item *node;
    cst_wave *empty;
    const char *tok;
    cst_breakfunc is_break = default_utt_break;
    cst_uttfunc user_hook = 0;
    float total = 0;
    int pending;
    int start_pos;
    int flush;

    stream = ts_open(filename,
                     get_param_string(voice->features,"text_whitespace",NULL),
                     get_param_string(voice->features,"text_singlecharsymbols",NULL),
                     get_param_string(voice->features,"text_prepunctuation",NULL),
                     get_param_string(voice->features,"text_postpunctuation",NULL));
    if (stream == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for reading\n", filename);
        return 1;
    }

    start_pos = get_param_int(voice->features,"file_start_position",0);
    if (start_pos > 0)
        ts_set_stream_pos(stream,start_pos);

    if (feat_present(voice->features,"utt_break"))
        is_break = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        user_hook = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    /* When writing to a file, create it as an empty wave first since each
       utterance is appended to it as it is synthesized */
    if (!(cst_streq(outtype,"play") ||
          cst_streq(outtype,"none") ||
          cst_streq(outtype,"stream")))
    {
        empty = new_wave();
        cst_wave_resize(empty,0,1);
        cst_wave_set_sample_rate(empty,16000);
        cst_wave_save_riff(empty,outtype);
        delete_wave(empty);
    }

    pending = 0;
    cur_utt = new_utterance();
    tok_rel = utt_relation_create(cur_utt, "Token");

    while (!ts_eof(stream) || pending > 0)
    {
        tok = ts_get(stream);

        flush = (cst_strlen(tok) == 0) ||
                (pending > 500) ||      /* need an upper bound */
                (relation_head(tok_rel) && is_break(stream,tok,tok_rel));
        if (flush)
        {
            /* End of an utterance: let the user hook see it, then speak it */
            if (user_hook)
                cur_utt = (user_hook)(cur_utt);
            if (!cur_utt)
                break;

            cur_utt = flite_do_synth(cur_utt,voice,utt_synth_tokens);
            total += flite_process_output(cur_utt,outtype,TRUE);
            delete_utterance(cur_utt);
            cur_utt = NULL;

            if (ts_eof(stream))
                break;

            cur_utt = new_utterance();
            tok_rel = utt_relation_create(cur_utt, "Token");
            pending = 0;
        }

        pending++;
        node = relation_append(tok_rel, NULL);
        item_set_string(node,"name",tok);
        item_set_string(node,"whitespace",stream->whitespace);
        item_set_string(node,"prepunctuation",stream->prepunctuation);
        item_set_string(node,"punc",stream->postpunctuation);
        /* Record where the token starts: the stream is already one
           character past the token and its surrounding punctuation */
        item_set_int(node,"file_pos",
                     stream->file_pos-(1+
                                       cst_strlen(tok)+
                                       cst_strlen(stream->prepunctuation)+
                                       cst_strlen(stream->postpunctuation)));
        item_set_int(node,"line_number",stream->line_number);
    }

    delete_utterance(cur_utt);
    ts_close(stream);
    return total;
}