/* Concatenate the selected units' coefficients/residuals into the
   utterance's target lpcres.
   NOTE(review): the visible text of this function appears truncated --
   it ends inside the function body (the per-unit copy loops and the
   final return are not shown here); confirm against the full source. */
cst_utterance *concat_units(cst_utterance *utt)
{
    cst_lpcres *target_lpcres;
    cst_item *u;
    int pm_i;
    int unit_size, unit_start, unit_end;
    int rpos, nearest_u_pm;
    int target_end, target_start;
    float m, u_index;
    cst_sts_list *sts_list;
    const char *residual_type;

    sts_list = val_sts_list(utt_feat_val(utt, "sts_list"));
    /* Default to ulaw-coded residuals when the unit db names no codec */
    if (sts_list->codec == NULL)
        residual_type = "ulaw";
    else
        residual_type = sts_list->codec;

    target_lpcres = val_lpcres(utt_feat_val(utt, "target_lpcres"));
    /* Copy quantization/format parameters from the unit database */
    target_lpcres->lpc_min = sts_list->coeff_min;
    target_lpcres->lpc_range = sts_list->coeff_range;
    target_lpcres->num_channels = sts_list->num_channels;
    target_lpcres->sample_rate = sts_list->sample_rate;
    /* Total sample count = time (in samples) of the last pitchmark */
    lpcres_resize_samples(target_lpcres,
                          target_lpcres->times[target_lpcres->num_frames - 1]);

    if (utt_feat_val(utt, "delayed_decoding"))
    {
        /* Keep residuals in packed form; decode lazily at resynthesis */
        target_lpcres->delayed_decoding = 1;
        target_lpcres->packed_residuals =
            cst_alloc(const unsigned char *, target_lpcres->num_frames);
    }
/* Vocode the predicted parameter track(s) back into a waveform,
   optionally applying MLPG smoothing first, and attach the result
   to the utterance. */
static cst_utterance *cg_resynth(cst_utterance *utt)
{
    cst_cg_db *db = val_cg_db(utt_feat_val(utt, "cg_db"));
    cst_track *params = val_track(utt_feat_val(utt, "param_track"));
    cst_track *strengths = NULL;
    cst_audio_streaming_info *asi = NULL;
    const cst_val *sival;
    cst_wave *wave;

    /* Optional incremental-audio callback info supplied by the caller */
    sival = get_param_val(utt->features, "streaming_info", NULL);
    if (sival != NULL)
        asi = val_audio_streaming_info(sival);

    /* Mixed excitation additionally needs the band-strength track */
    if (db->mixed_excitation)
        strengths = val_track(utt_feat_val(utt, "str_track"));

    if (db->do_mlpg)
    {
        /* Smooth the parameter trajectories before vocoding */
        cst_track *smoothed = mlpg(params, db);
        wave = mlsa_resynthesis(smoothed, strengths, db, asi);
        delete_track(smoothed);
    }
    else
        wave = mlsa_resynthesis(params, strengths, db, asi);

    utt_set_wave(utt, wave);
    return utt;
}
/* Overwrite the F0 channel (channel 0) of the utterance's param_track
   with SPAM F0 predictions: a per-segment baseline from the phrase
   tree, plus Tilt-style accent shapes from the accent tree added per
   syllable.
   Fix: hoist the repeated utt_feat_int(utt,"param_track_num_frames")
   lookups (string-keyed feature search per loop iteration) into a
   loop-invariant local. */
cst_utterance *cst_spamf0(cst_utterance *utt)
{
    cst_track *spamf0_track = NULL;
    cst_track *param_track = NULL;
    cst_item *s;
    cst_cg_db *cg_db;
    const cst_cart *acc_tree, *phrase_tree;
    float end, f0val, syldur;
    int num_frames, f, i;
    int track_frames;   /* hoisted: total frames in the parameter track */

    cg_db = val_cg_db(utt_feat_val(utt, "cg_db"));
    track_frames = utt_feat_int(utt, "param_track_num_frames");
    spamf0_track = new_track();
    cst_track_resize(spamf0_track, track_frames, 1);
    acc_tree = cg_db->spamf0_accent_tree;
    phrase_tree = cg_db->spamf0_phrase_tree;

    /* Phrase pass: fill each segment's frames with its predicted
       baseline F0 (zero during pauses). */
    end = 0.0;
    num_frames = 0;
    for (s = utt_rel_head(utt, "Segment"); s; s = item_next(s))
    {
        end = ffeature_float(s, "end");
        if (cst_streq("pau", ffeature_string(s, "name")))
        {
            f0val = 0;
        }
        else
        {
            f0val = val_float(cart_interpret(s, phrase_tree));
        }
        for (; ((num_frames * cg_db->frame_advance) <= end) &&
               (num_frames < track_frames);
             num_frames++)
        {
            spamf0_track->frames[num_frames][0] = f0val;
        }
    }

    /* Accent pass: add a Tilt accent event over each syllable's span. */
    for (s = utt_rel_head(utt, "Syllable"); s; s = item_next(s))
    {
        f = val_int(cart_interpret(s, acc_tree));
        syldur = ffeature_float(s, "R:SylStructure.daughtern.R:Segment.end")
            - ffeature_float(s, "R:SylStructure.daughter1.R:Segment.p.end");
        cst_synthtilt(cg_db,
                      ffeature_float(s,
                          "R:SylStructure.daughter1.R:Segment.p.end"),
                      cg_db->spamf0_accent_vectors[f][0],
                      cg_db->spamf0_accent_vectors[f][2],
                      syldur,
                      cg_db->spamf0_accent_vectors[f][6],
                      spamf0_track);
    }

    /* Copy the combined contour into channel 0 of the parameter track */
    param_track = val_track(utt_feat_val(utt, "param_track"));
    for (i = 0; i < track_frames; i++)
    {
        param_track->frames[i][0] = spamf0_track->frames[i][0];
    }
    delete_track(spamf0_track);
    return utt;
}
/* Build the HMM state structure below the segment structure: for each
   Segment, append one HMMstate item per state of its phone (looked up
   in cg_db->phone_states), linked underneath via the "segstate"
   relation. */
static cst_utterance *cg_make_hmmstates(cst_utterance *utt)
{
    cst_cg_db *db;
    cst_relation *states, *segstates;
    cst_item *seg, *state, *seg_node;
    const char *phone;
    int pi, si;

    db = val_cg_db(utt_feat_val(utt, "cg_db"));
    states = utt_relation_create(utt, "HMMstate");
    segstates = utt_relation_create(utt, "segstate");

    for (seg = utt_rel_head(utt, "Segment"); seg; seg = item_next(seg))
    {
        seg_node = relation_append(segstates, seg);
        phone = item_feat_string(seg, "name");

        /* Locate this phone's entry in the state-name table */
        for (pi = 0; db->phone_states[pi]; pi++)
            if (cst_streq(phone, db->phone_states[pi][0]))
                break;
        if (db->phone_states[pi] == NULL)
            pi = 0;  /* unknown phoneme: fall back to the first entry */

        /* Entry 0 is the phone name itself; states start at index 1 */
        for (si = 1; db->phone_states[pi][si]; si++)
        {
            state = relation_append(states, NULL);
            item_add_daughter(seg_node, state);
            item_set_string(state, "name", db->phone_states[pi][si]);
            item_set_int(state, "statepos", si);
        }
    }

    return utt;
}
/* Simple unit join: take the unit pitchmarks as-is, concatenate the
   units, and resynthesize with the fixed-point LPC resynthesizer
   (the only type this path supports). */
cst_utterance *join_units_simple(cst_utterance *utt)
{
    const char *resynth_type =
        get_param_string(utt->features, "resynth_type", "fixed");
    const cst_val *sinfo;
    cst_lpcres *lpcres;
    cst_wave *w = 0;

    asis_to_pm(utt);    /* target pitchmarks copied from the units */
    concat_units(utt);  /* fill in coefficients and residuals */

    lpcres = val_lpcres(utt_feat_val(utt, "target_lpcres"));

    /* Hook up streaming callback info, if the caller provided any */
    sinfo = get_param_val(utt->features, "streaming_info", NULL);
    if (sinfo != NULL)
    {
        lpcres->asi = val_audio_streaming_info(sinfo);
        lpcres->asi->utt = utt;
    }

    if (cst_streq(resynth_type, "fixed"))
        w = lpc_resynth_fixedpoint(lpcres);
    else
    {
        cst_errmsg("unknown resynthesis type %s\n", resynth_type);
        cst_error();  /* Should not happen */
    }

    utt_set_wave(utt, w);
    return utt;
}
/* Convert the utterance's F0 target points into pitchmark times for a
   new target lpcres.  F0 is linearly interpolated between consecutive
   targets, and one pitchmark is emitted per pitch period
   (period = 1/F0 seconds).  Two passes: the first only counts the
   pitchmarks so the lpcres can be sized, the second records their
   times (in samples).  The two passes must step identically. */
cst_utterance *f0_targets_to_pm(cst_utterance *utt)
{
    cst_item *t;
    float pos,lpos,f0,lf0,m;
    double time;
    int pm;
    cst_sts_list *sts_list;
    cst_lpcres *target_lpcres;

    sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
    lpos = 0;
    lf0 = 120; /* hmm: assumed F0 before the first target -- TODO confirm */
    pm = 0;
    time = 0;
    /* First pass to count how many pms will be required */
    for (t=relation_head(utt_relation(utt,"Target"));
         t;
         t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
    {
        pos = item_feat_float(t,"pos");
        f0 = item_feat_float(t,"f0");
        /* Zero-length span: skip it (the for-increment still updates
           lf0/lpos to this target's values). */
        if (time == pos) continue;
        m = (f0-lf0)/(pos-lpos);  /* F0 slope across this span */
        for ( ; time < pos; pm++)
        {
            /* Advance by one pitch period at the interpolated F0 */
            time += 1/(lf0 + ((time-lpos)*m));
        }
    }
    target_lpcres = new_lpcres();
    lpcres_resize_frames(target_lpcres,pm);
    /* Reset and replay the exact same walk to fill in the times */
    lpos = 0;
    lf0 = 120;
    pm = 0;
    time = 0;
    /* Second pass puts the values in */
    for (t=relation_head(utt_relation(utt,"Target"));
         t;
         t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
    {
        pos = item_feat_float(t,"pos");
        f0 = item_feat_float(t,"f0");
        if (time == pos) continue;
        m = (f0-lf0)/(pos-lpos);
        for ( ; time < pos; pm++)
        {
            time += 1/(lf0 + ((time-lpos)*m));
            /* Pitchmark time converted from seconds to samples */
            target_lpcres->times[pm] = sts_list->sample_rate * time;
        }
    }

    utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
    return utt;
}
/* Top-level clustergen synthesis pipeline: build HMM states, lay out
   parameter frames, predict their values, optionally overlay SPAM F0,
   then vocode the result into a waveform. */
cst_utterance *cg_synth(cst_utterance *utt)
{
    cst_cg_db *db = val_cg_db(utt_feat_val(utt, "cg_db"));

    cg_make_hmmstates(utt);
    cg_make_params(utt);
    cg_predict_params(utt);
    if (db->spamf0)
        cst_spamf0(utt);
    cg_resynth(utt);

    return utt;
}
/* Lay out the parameter frame items: one "mcep" item per frame_advance
   across each HMM state's predicted (and possibly stretched) duration,
   linked to its state via "mcep_link".  Historically called "mcep"
   though the vectors can be any parameters.  Also copies the final
   state end times up onto the Segment relation and records the total
   frame count in "param_track_num_frames". */
static cst_utterance *cg_make_params(cst_utterance *utt)
{
    cst_cg_db *db;
    cst_relation *frames, *links;
    cst_item *state, *seg, *link, *frame;
    int nframes = 0;
    float t_start, t_end = 0.0;
    float global_stretch, local_stretch;

    db = val_cg_db(utt_feat_val(utt, "cg_db"));
    frames = utt_relation_create(utt, "mcep");
    links = utt_relation_create(utt, "mcep_link");
    global_stretch = get_param_float(utt->features, "duration_stretch", 1.0);

    for (state = utt_rel_head(utt, "HMMstate"); state; state = item_next(state))
    {
        t_start = t_end;
        /* Per-token stretch, if the source token carries one (0 = unset) */
        local_stretch = ffeature_float(state,
            "R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_duration_stretch");
        if (local_stretch == 0)
            local_stretch = 1.0;
        t_end = t_start +
            (local_stretch * global_stretch * cg_state_duration(state, db));
        item_set_float(state, "end", t_end);

        link = relation_append(links, state);
        /* One frame item every frame_advance seconds up to the state end */
        for (; (nframes * db->frame_advance) <= t_end; nframes++)
        {
            frame = relation_append(frames, NULL);
            item_add_daughter(link, frame);
            item_set_int(frame, "frame_number", nframes);
            item_set(frame, "name", item_feat(link, "name"));
        }
    }

    /* Copy duration up onto Segment relation */
    for (seg = utt_rel_head(utt, "Segment"); seg; seg = item_next(seg))
        item_set(seg, "end", ffeature(seg, "R:segstate.daughtern.end"));

    utt_set_feat_int(utt, "param_track_num_frames", nframes);
    return utt;
}
/* Join units with F0-target-driven pitchmark placement, then LPC
   resynthesis (floating point by default, fixed point on request).
   If resynthesis returns no wave (e.g. interrupted), mark the
   utterance "Interrupted" and attach an empty wave. */
cst_utterance *join_units_modified_lpc(cst_utterance *utt)
{
    const char *resynth_type =
        get_param_string(utt->features, "resynth_type", "float");
    const cst_val *sinfo;
    cst_lpcres *lpcres;
    cst_wave *w = 0;

    f0_targets_to_pm(utt);  /* place pitchmarks from the F0 targets */
    concat_units(utt);      /* fill in coefficients and residuals */

    lpcres = val_lpcres(utt_feat_val(utt, "target_lpcres"));

    /* Hook up streaming callback info, if the caller provided any */
    sinfo = get_param_val(utt->features, "streaming_info", NULL);
    if (sinfo != NULL)
    {
        lpcres->asi = val_audio_streaming_info(sinfo);
        lpcres->asi->utt = utt;
    }

    if (cst_streq(resynth_type, "float"))
        w = lpc_resynth(lpcres);
    else if (cst_streq(resynth_type, "fixed"))
        w = lpc_resynth_fixedpoint(lpcres);
    else
    {
        cst_errmsg("unknown resynthesis type %s\n", resynth_type);
        cst_error();  /* Should not happen */
    }

    if (w == NULL)
    {
        /* Synthesis Failed, probably because it was interrupted */
        utt_set_feat_int(utt, "Interrupted", 1);
        w = new_wave();
    }

    utt_set_wave(utt, w);
    return utt;
}
/* Copy the pitchmark structure from the selected units unchanged into
   a new target lpcres: frame times are the cumulative sample positions
   of the unit frames.  Also records each unit's cumulative end sample
   as "target_end". */
cst_utterance *asis_to_pm(cst_utterance *utt)
{
    cst_item *unit;
    cst_lpcres *target;
    cst_sts_list *sts;
    int u_start, u_end;
    int total_pms, total_samples, i;

    sts = val_sts_list(utt_feat_val(utt, "sts_list"));
    target = new_lpcres();

    /* Pass one: count pitchmarks and samples over all units */
    total_pms = total_samples = 0;
    for (unit = relation_head(utt_relation(utt, "Unit"));
         unit;
         unit = item_next(unit))
    {
        u_start = item_feat_int(unit, "unit_start");
        u_end = item_feat_int(unit, "unit_end");
        total_samples += get_unit_size(sts, u_start, u_end);
        total_pms += u_end - u_start;
        item_set_int(unit, "target_end", total_samples);
    }
    lpcres_resize_frames(target, total_pms);

    /* Pass two: record each pitchmark's cumulative sample position */
    total_pms = total_samples = 0;
    for (unit = relation_head(utt_relation(utt, "Unit"));
         unit;
         unit = item_next(unit))
    {
        u_start = item_feat_int(unit, "unit_start");
        u_end = item_feat_int(unit, "unit_end");
        for (i = u_start; i < u_end; i++, total_pms++)
        {
            total_samples += get_frame_size(sts, i);
            target->times[total_pms] = total_samples;
        }
    }

    utt_set_feat(utt, "target_lpcres", lpcres_val(target));
    return utt;
}
/* Interpret each Token as a phone name and build flat Segment /
   Syllable / Word / SylStructure relations from them.  A "-" token
   marks a syllable break; a trailing '0'/'1' on a name sets the
   current syllable's stress.  All syllables hang off a single dummy
   word named "phonestring".
   NOTE(review): a segment item is appended for every token, including
   "-" breaks, where it is left unnamed and unattached -- presumably
   intentional upstream behavior; confirm before changing. */
static cst_utterance *tokentosegs(cst_utterance *u)
{
    cst_item *tok;
    cst_relation *seg, *syl, *sylstructure, *word;
    cst_item *cursyl, *curword, *curwordss, *curssyl;
    cst_phoneset *ps;

    ps = val_phoneset(utt_feat_val(u, "phoneset"));
    /* Just copy tokens into the Segment relation */
    seg = utt_relation_create(u, "Segment");
    syl = utt_relation_create(u, "Syllable");
    word = utt_relation_create(u, "Word");
    sylstructure = utt_relation_create(u, "SylStructure");
    curssyl = cursyl = curword = curwordss = 0;

    for (tok = relation_head(utt_relation(u, "Token")); tok; tok = item_next(tok))
    {
        cst_item *segitem = relation_append(seg, NULL);
        const char *pname = item_feat_string(tok, "name");
        char *name = cst_strdup(pname);
        int len;

        /* Lazily create the single dummy word holding every syllable */
        if (curword == 0)
        {
            curword = relation_append(word, NULL);
            item_set_string(curword, "name", "phonestring");
            curwordss = relation_append(sylstructure, curword);
        }
        /* Start a fresh syllable after each break */
        if (cursyl == 0)
        {
            cursyl = relation_append(syl, NULL);
            curssyl = item_add_daughter(curwordss, cursyl);
        }

        /* A trailing '1' or '0' is the syllable's stress mark */
        len = cst_strlen(name);
        if (name[len - 1] == '1')
        {
            item_set_string(curssyl, "stress", "1");
            name[len - 1] = '\0';
        }
        else if (name[len - 1] == '0')
        {
            item_set_string(curssyl, "stress", "0");
            name[len - 1] = '\0';
        }

        if (cst_streq(name, "-"))
            cursyl = 0;  /* syllable break */
        else if (phone_id(ps, name) == -1)
        {
            cst_errmsg("Phone `%s' not in phoneset\n", pname);
            cst_error();
        }
        else
        {
            item_add_daughter(curssyl, segitem);
            item_set_string(segitem, "name", name);
        }
        cst_free(name);
    }
    return u;
}
/* Predict the parameter vectors for every "mcep" frame item: F0 from
   the F0 trees, spectral (and optionally band-strength) parameters by
   CART-selecting a model vector per frame, for either a single model
   or a two-model (MULTI) average.  Results go into "param_track" (and
   "str_track" when mixed excitation is on). */
static cst_utterance *cg_predict_params(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_item *mcep;
    const cst_cart *mcep_tree, *f0_tree;
    int i,j,f,p,fd,o;
    const char *mname;
    float f0_val;
    int fff;              /* stride through model vectors: 1 = with stddevs */
    int extra_feats = 0;  /* trailing channels (voicing, strengths) not copied */

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = new_track();
    if (cg_db->do_mlpg) /* which should be the default */
        fff = 1;  /* copy details with stddevs */
    else
        fff = 2;  /* copy details without stddevs */

    extra_feats = 1;  /* voicing */
    if (cg_db->mixed_excitation)
    {
        /* Five band-strength channels go in their own track */
        extra_feats += 5;
        str_track = new_track();
        cst_track_resize(str_track,
                         utt_feat_int(utt,"param_track_num_frames"),
                         5);
    }

    cst_track_resize(param_track,
                     utt_feat_int(utt,"param_track_num_frames"),
                     (cg_db->num_channels0/fff)-
                     (2 * extra_feats));/* no voicing or str */
    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        mname = item_feat_string(mcep,"name");
        /* Find the tree set matching this frame's state name */
        for (p=0; cg_db->types[p]; p++)
            if (cst_streq(mname,cg_db->types[p]))
                break;
        /* NOTE(review): this checks types[0], so when the name matches
           no entry, p indexes the NULL terminator and the lookups below
           read past the tree tables; presumably types[p] was intended
           -- confirm against upstream. */
        if (cg_db->types[0] == NULL)
            p=0; /* if there isn't a matching tree, use the first one */

        /* Predict F0 */
        f0_tree = cg_db->f0_trees[p];
        f0_val = val_float(cart_interpret(mcep,f0_tree));
        param_track->frames[i][0] = f0_val;
        /* what about stddev ? */
        if (cg_db->multimodel)
        {   /* MULTI model: average the predictions of two model sets */
            f = val_int(cart_interpret(mcep,cg_db->param_trees0[p]));
            fd = val_int(cart_interpret(mcep,cg_db->param_trees1[p]));
            item_set_int(mcep,"clustergen_param_frame",f);

            /* F0 is the mean of the tree's F0 and both models' F0 */
            param_track->frames[i][0] =
                (param_track->frames[i][0]+
                 CG_MODEL_VECTOR(cg_db,model_vectors0,f,0)+
                 CG_MODEL_VECTOR(cg_db,model_vectors1,fd,0))/3.0;
            /* Spectral channels (channel 1 is skipped -- handled as F0) */
            for (j=2; j<param_track->num_channels; j++)
                param_track->frames[i][j] =
                    (CG_MODEL_VECTOR(cg_db,model_vectors0,f,(j)*fff)+
                     CG_MODEL_VECTOR(cg_db,model_vectors1,fd,(j)*fff))/2.0;
            if (cg_db->mixed_excitation)
            {
                /* Band strengths follow the spectral channels */
                o = j;
                for (j=0; j<5; j++)
                {
                    str_track->frames[i][j] =
                        (CG_MODEL_VECTOR(cg_db,model_vectors0,f,(o+(2*j))*fff)+
                         CG_MODEL_VECTOR(cg_db,model_vectors1,fd,(o+(2*j))*fff))/2.0;
                }
            }
        }
        else
        {   /* SINGLE model */
            /* Predict Spectral */
            mcep_tree = cg_db->param_trees0[p];
            f = val_int(cart_interpret(mcep,mcep_tree));
            item_set_int(mcep,"clustergen_param_frame",f);

            /* F0 is the mean of the tree's F0 and the model's F0 */
            param_track->frames[i][0] =
                (param_track->frames[i][0]+
                 CG_MODEL_VECTOR(cg_db,model_vectors0,f,0))/2.0;
            for (j=2; j<param_track->num_channels; j++)
                param_track->frames[i][j] =
                    CG_MODEL_VECTOR(cg_db,model_vectors0,f,(j)*fff);

            if (cg_db->mixed_excitation)
            {
                o = j;
                for (j=0; j<5; j++)
                {
                    str_track->frames[i][j] =
                        CG_MODEL_VECTOR(cg_db,model_vectors0,f,(o+(2*j))*fff);
                }
            }
        }

        /* last coefficient is average voicing for cluster */
        item_set_float(mcep,"voicing",
                       CG_MODEL_VECTOR(cg_db,model_vectors0,f,
                                       cg_db->num_channels0-2));

        param_track->times[i] = i * cg_db->frame_advance;
    }

    cg_smooth_F0(utt,cg_db,param_track);

    utt_set_feat(utt,"param_track",track_val(param_track));
    if (cg_db->mixed_excitation)
        utt_set_feat(utt,"str_track",track_val(str_track));

    return utt;
}