示例#1
0
cst_utterance *concat_units(cst_utterance *utt)
{
    cst_lpcres *target_lpcres;
    cst_item *u;
    int pm_i;
    int unit_size, unit_start, unit_end;
    int rpos, nearest_u_pm;
    int target_end, target_start;
    float m, u_index;
    cst_sts_list *sts_list;
    const char *residual_type;

    sts_list = val_sts_list(utt_feat_val(utt, "sts_list"));
    if (sts_list->codec == NULL)
        residual_type = "ulaw";
    else
        residual_type = sts_list->codec;
    target_lpcres = val_lpcres(utt_feat_val(utt, "target_lpcres"));

    target_lpcres->lpc_min = sts_list->coeff_min;
    target_lpcres->lpc_range = sts_list->coeff_range;
    target_lpcres->num_channels = sts_list->num_channels;
    target_lpcres->sample_rate = sts_list->sample_rate;
    lpcres_resize_samples(target_lpcres,
                          target_lpcres->times[target_lpcres->num_frames -
                                               1]);
    if (utt_feat_val(utt, "delayed_decoding"))
    {
        target_lpcres->delayed_decoding = 1;
        target_lpcres->packed_residuals =
            cst_alloc(const unsigned char *, target_lpcres->num_frames);
    }
示例#2
0
static cst_utterance *cg_resynth(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_wave *w;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_track *smoothed_track;
    const cst_val *streaming_info_val;
    cst_audio_streaming_info *asi = NULL;

    streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
    if (streaming_info_val)
        asi = val_audio_streaming_info(streaming_info_val);

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = val_track(utt_feat_val(utt,"param_track"));
    if (cg_db->mixed_excitation)
        str_track = val_track(utt_feat_val(utt,"str_track"));

    if (cg_db->do_mlpg)
    {
        smoothed_track = mlpg(param_track, cg_db);
        w = mlsa_resynthesis(smoothed_track,str_track,cg_db,asi);
        delete_track(smoothed_track);
    }
    else
        w=mlsa_resynthesis(param_track,str_track,cg_db,asi);


    utt_set_wave(utt,w);

    return utt;
}
示例#3
0
cst_utterance *cst_spamf0(cst_utterance *utt)
{
    cst_track *spamf0_track = NULL;
    cst_track *param_track = NULL;
    cst_item *s;
    cst_cg_db *cg_db;
    const cst_cart *acc_tree, *phrase_tree;
    float end, f0val, syldur;
    int num_frames, f, i;
    cg_db = val_cg_db(utt_feat_val(utt, "cg_db"));

    spamf0_track = new_track();
    cst_track_resize(spamf0_track,
                     (utt_feat_int(utt, "param_track_num_frames")), 1);
    acc_tree = cg_db->spamf0_accent_tree;
    phrase_tree = cg_db->spamf0_phrase_tree;
    end = 0.0;
    num_frames = 0;
    for (s = utt_rel_head(utt, "Segment"); s; s = item_next(s))
    {
        end = ffeature_float(s, "end");
        if (cst_streq("pau", ffeature_string(s, "name")))
        {
            f0val = 0;
        }
        else
        {
            f0val = val_float(cart_interpret(s, phrase_tree));
        }

        for (;
             ((num_frames * cg_db->frame_advance) <= end)
             && (num_frames < utt_feat_int(utt, "param_track_num_frames"));
             num_frames++)
        {
            spamf0_track->frames[num_frames][0] = f0val;
        }
    }

    for (s = utt_rel_head(utt, "Syllable"); s; s = item_next(s))
    {
        f = val_int(cart_interpret(s, acc_tree));
        syldur = ffeature_float(s, "R:SylStructure.daughtern.R:Segment.end")
            - ffeature_float(s, "R:SylStructure.daughter1.R:Segment.p.end");
        cst_synthtilt(cg_db,
                      ffeature_float(s,
                                     "R:SylStructure.daughter1.R:Segment.p.end"),
                      cg_db->spamf0_accent_vectors[f][0],
                      cg_db->spamf0_accent_vectors[f][2], syldur,
                      cg_db->spamf0_accent_vectors[f][6], spamf0_track);
    }
    param_track = val_track(utt_feat_val(utt, "param_track"));
    for (i = 0; i < utt_feat_int(utt, "param_track_num_frames"); i++)
    {
        param_track->frames[i][0] = spamf0_track->frames[i][0];
    }
    delete_track(spamf0_track);
    return utt;
}
示例#4
0
static cst_utterance *cg_make_hmmstates(cst_utterance *utt)
{
    /* Build HMM state structure below the segment structure */
    cst_cg_db *cg_db;
    cst_relation *hmmstate, *segstate;
    cst_item *seg, *s, *ss;
    const char *segname;
    int sp,p;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    hmmstate = utt_relation_create(utt,"HMMstate");
    segstate = utt_relation_create(utt,"segstate");

    for (seg = utt_rel_head(utt,"Segment"); seg; seg=item_next(seg))
    {
        ss = relation_append(segstate,seg);
        segname = item_feat_string(seg,"name");
        for (p=0; cg_db->phone_states[p]; p++)
            if (cst_streq(segname,cg_db->phone_states[p][0]))
                break;
        if (cg_db->phone_states[p] == NULL)
            p = 0;  /* unknown phoneme */
        for (sp=1; cg_db->phone_states[p][sp]; sp++)
        {
            s = relation_append(hmmstate,NULL);
            item_add_daughter(ss,s);
            item_set_string(s,"name",cg_db->phone_states[p][sp]);
            item_set_int(s,"statepos",sp);
        }
    }

    return utt;
}
示例#5
0
cst_utterance *join_units_simple(cst_utterance *utt)
{
    cst_wave *w = 0;
    cst_lpcres *lpcres;
    const char *resynth_type;
    const cst_val *streaming_info_val;

    resynth_type = get_param_string(utt->features, "resynth_type", "fixed");

    asis_to_pm(utt);
    concat_units(utt);

    lpcres = val_lpcres(utt_feat_val(utt, "target_lpcres"));

    streaming_info_val = get_param_val(utt->features, "streaming_info", NULL);
    if (streaming_info_val)
    {
        lpcres->asi = val_audio_streaming_info(streaming_info_val);
        lpcres->asi->utt = utt;
    }

    if (cst_streq(resynth_type, "fixed"))
        w = lpc_resynth_fixedpoint(lpcres);
    else
    {
        cst_errmsg("unknown resynthesis type %s\n", resynth_type);
        cst_error();            /* Should not happen */
    }

    utt_set_wave(utt, w);

    return utt;
}
示例#6
0
cst_utterance *f0_targets_to_pm(cst_utterance *utt)
{
    cst_item *t;
    float pos,lpos,f0,lf0,m;
    double time;
    int pm;
    cst_sts_list *sts_list;
    cst_lpcres *target_lpcres;

    sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
    lpos = 0;
    lf0 = 120; /* hmm */
    pm = 0;
    time = 0;
    /* First pass to count how many pms will be required */
    for (t=relation_head(utt_relation(utt,"Target"));
	 t;
	 t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
    {
	pos = item_feat_float(t,"pos");
	f0 = item_feat_float(t,"f0");
	if (time == pos) continue;
	m = (f0-lf0)/(pos-lpos);
	for ( ; time < pos; pm++)
	{
	    time += 1/(lf0 + ((time-lpos)*m));
	}
    }
    target_lpcres = new_lpcres();
    lpcres_resize_frames(target_lpcres,pm);

    lpos = 0;
    lf0 = 120;
    pm = 0;
    time = 0;
    /* Second pass puts the values in */
    for (t=relation_head(utt_relation(utt,"Target"));
	 t;
	 t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
    {
	pos = item_feat_float(t,"pos");
	f0 = item_feat_float(t,"f0");
	if (time == pos) continue;
	m = (f0-lf0)/(pos-lpos);
	for ( ; time < pos; pm++)
	{
	    time += 1/(lf0 + ((time-lpos)*m));
	    target_lpcres->times[pm] = sts_list->sample_rate * time;
	}
    }
    utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
    return utt;
}
cst_utterance *cg_synth(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));

    cg_make_hmmstates(utt);
    cg_make_params(utt);
    cg_predict_params(utt);
    if (cg_db->spamf0)
    {
	cst_spamf0(utt);
    }
    cg_resynth(utt);

    return utt;
}
static cst_utterance *cg_make_params(cst_utterance *utt)
{
    /* puts in the frame items */
    /* historically called "mcep" but can actually be any random vectors */
    cst_cg_db *cg_db;
    cst_relation *mcep, *mcep_link;
    cst_item *s, *mcep_parent, *mcep_frame;
    int num_frames;
    float start, end;
    float dur_stretch, tok_stretch;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    mcep = utt_relation_create(utt,"mcep");
    mcep_link = utt_relation_create(utt,"mcep_link");
    end = 0.0;
    num_frames = 0;
    dur_stretch = get_param_float(utt->features,"duration_stretch", 1.0);

    for (s = utt_rel_head(utt,"HMMstate"); s; s=item_next(s))
    {
        start = end;
        tok_stretch = ffeature_float(s,"R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_duration_stretch");
        if (tok_stretch == 0)
            tok_stretch = 1.0;
        end = start + (tok_stretch*dur_stretch*cg_state_duration(s,cg_db));
        item_set_float(s,"end",end);
        mcep_parent = relation_append(mcep_link, s);
        for ( ; (num_frames * cg_db->frame_advance) <= end; num_frames++ )
        {
            mcep_frame = relation_append(mcep,NULL);
            item_add_daughter(mcep_parent,mcep_frame);
            item_set_int(mcep_frame,"frame_number",num_frames);
            item_set(mcep_frame,"name",item_feat(mcep_parent,"name"));
        }
    }

    /* Copy duration up onto Segment relation */
    for (s = utt_rel_head(utt,"Segment"); s; s=item_next(s))
        item_set(s,"end",ffeature(s,"R:segstate.daughtern.end"));

    utt_set_feat_int(utt,"param_track_num_frames",num_frames);

    return utt;
}
示例#9
0
cst_utterance *join_units_modified_lpc(cst_utterance *utt)
{
    cst_wave *w = 0;
    cst_lpcres *lpcres;
    const char *resynth_type;
    const cst_val *streaming_info_val;

    resynth_type = get_param_string(utt->features, "resynth_type", "float");

    f0_targets_to_pm(utt);
    concat_units(utt);

    lpcres = val_lpcres(utt_feat_val(utt, "target_lpcres"));

    streaming_info_val = get_param_val(utt->features, "streaming_info", NULL);
    if (streaming_info_val)
    {
        lpcres->asi = val_audio_streaming_info(streaming_info_val);
        lpcres->asi->utt = utt;
    }

    if (cst_streq(resynth_type, "float"))
        w = lpc_resynth(lpcres);
    else if (cst_streq(resynth_type, "fixed"))
    {
        w = lpc_resynth_fixedpoint(lpcres);
    }
    else
    {
        cst_errmsg("unknown resynthesis type %s\n", resynth_type);
        cst_error();            /* Should not happen */
    }

    if (w == NULL)
    {
        /* Synthesis Failed, probably because it was interrupted */
        utt_set_feat_int(utt, "Interrupted", 1);
        w = new_wave();
    }

    utt_set_wave(utt, w);

    return utt;
}
示例#10
0
cst_utterance *asis_to_pm(cst_utterance *utt)
{
    /* Copy the PM structure from the units unchanged */
    cst_item *u;
    cst_lpcres *target_lpcres;
    int  unit_start, unit_end;
    int utt_pms, utt_size, i;
    cst_sts_list *sts_list;

    sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
    target_lpcres = new_lpcres();

    /* Pass one to find the size */
    utt_pms = utt_size = 0;
    for (u=relation_head(utt_relation(utt,"Unit"));
	 u; 
	 u=item_next(u))
    {
	unit_start = item_feat_int(u,"unit_start");
	unit_end = item_feat_int(u,"unit_end");
	utt_size += get_unit_size(sts_list,unit_start,unit_end);
	utt_pms += unit_end - unit_start;
	item_set_int(u,"target_end",utt_size);
    }
    lpcres_resize_frames(target_lpcres,utt_pms);

    /* Pass two to fill in the values */
    utt_pms = utt_size = 0;
    for (u=relation_head(utt_relation(utt,"Unit"));
	 u; 
	 u=item_next(u))
    {
	unit_start = item_feat_int(u,"unit_start");
	unit_end = item_feat_int(u,"unit_end");
	for (i=unit_start; i<unit_end; i++,utt_pms++)
	{
	    utt_size += get_frame_size(sts_list, i);
	    target_lpcres->times[utt_pms] = utt_size;
	}
    }
    utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
    return utt;
}
示例#11
0
static cst_utterance *tokentosegs(cst_utterance *u)
{
    cst_item *t;
    cst_relation *seg, *syl, *sylstructure, *word;
    cst_item *sylitem, *sylstructureitem, *worditem, *sssyl;
    cst_phoneset *ps;

    ps = val_phoneset(utt_feat_val(u, "phoneset"));
    /* Just copy tokens into the Segment relation */
    seg = utt_relation_create(u, "Segment");
    syl = utt_relation_create(u, "Syllable");
    word = utt_relation_create(u, "Word");
    sylstructure = utt_relation_create(u, "SylStructure");
    sssyl = sylitem = worditem = sylstructureitem = 0;
    for (t = relation_head(utt_relation(u, "Token")); t; t = item_next(t)) 
    {
	cst_item *segitem = relation_append(seg, NULL);
	char const *pname = item_feat_string(t, "name");
	char *name = cst_strdup(pname);

	if (worditem == 0)
	{
	    worditem = relation_append(word,NULL);
	    item_set_string(worditem, "name", "phonestring");
	    sylstructureitem = relation_append(sylstructure,worditem);
	}
	if (sylitem == 0)
	{
	    sylitem = relation_append(syl,NULL);
	    sssyl = item_add_daughter(sylstructureitem,sylitem);
	}
	
	if (name[cst_strlen(name)-1] == '1')
	{
	    item_set_string(sssyl,"stress","1");
	    name[cst_strlen(name)-1] = '\0';
	}
	else if (name[cst_strlen(name)-1] == '0')
	{
	    item_set_string(sssyl,"stress","0");
	    name[cst_strlen(name)-1] = '\0';
	}

	if (cst_streq(name,"-"))
	{
	    sylitem = 0;  /* syllable break */
	}
	else if (phone_id(ps, name) == -1) 
	{
	    cst_errmsg("Phone `%s' not in phoneset\n", pname);
	    cst_error();
	}
	else
	{
	    item_add_daughter(sssyl,segitem);
	    item_set_string(segitem, "name", name);
	}

	cst_free(name);
    }

    return u;
}
示例#12
0
static cst_utterance *cg_predict_params(cst_utterance *utt)
{
    cst_cg_db *cg_db;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_item *mcep;
    const cst_cart *mcep_tree, *f0_tree;
    int i,j,f,p,fd,o;
    const char *mname;
    float f0_val;
    int fff;
    int extra_feats = 0;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = new_track();
    if (cg_db->do_mlpg) /* which should be the default */
        fff = 1;  /* copy details with stddevs */
    else
        fff = 2;  /* copy details without stddevs */

    extra_feats = 1;  /* voicing */
    if (cg_db->mixed_excitation)
    {
        extra_feats += 5;
        str_track = new_track();
        cst_track_resize(str_track,
                         utt_feat_int(utt,"param_track_num_frames"),
                         5);
    }
    
    cst_track_resize(param_track,
                     utt_feat_int(utt,"param_track_num_frames"),
                     (cg_db->num_channels0/fff)-
                       (2 * extra_feats));/* no voicing or str */
    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        mname = item_feat_string(mcep,"name");
        for (p=0; cg_db->types[p]; p++)
            if (cst_streq(mname,cg_db->types[p]))
                break;
        if (cg_db->types[0] == NULL)
            p=0; /* if there isn't a matching tree, use the first one */

        /* Predict F0 */
        f0_tree = cg_db->f0_trees[p];
        f0_val = val_float(cart_interpret(mcep,f0_tree));
        param_track->frames[i][0] = f0_val;
        /* what about stddev ? */

        if (cg_db->multimodel)
        {   /* MULTI model */
            f = val_int(cart_interpret(mcep,cg_db->param_trees0[p]));
            fd = val_int(cart_interpret(mcep,cg_db->param_trees1[p]));
            item_set_int(mcep,"clustergen_param_frame",f);

            param_track->frames[i][0] = 
                (param_track->frames[i][0]+
                 CG_MODEL_VECTOR(cg_db,model_vectors0,f,0)+
                 CG_MODEL_VECTOR(cg_db,model_vectors1,fd,0))/3.0;
            for (j=2; j<param_track->num_channels; j++)
                param_track->frames[i][j] = 
                    (CG_MODEL_VECTOR(cg_db,model_vectors0,f,(j)*fff)+
                     CG_MODEL_VECTOR(cg_db,model_vectors1,fd,(j)*fff))/2.0;
            if (cg_db->mixed_excitation)
            {
                o = j;
                for (j=0; j<5; j++)
                {
                    str_track->frames[i][j] =
                        (CG_MODEL_VECTOR(cg_db,model_vectors0,f,(o+(2*j))*fff)+
                         CG_MODEL_VECTOR(cg_db,model_vectors1,fd,(o+(2*j))*fff))/2.0;
                }
            }
        }
        else  
        {   /* SINGLE model */
            /* Predict Spectral */
            mcep_tree = cg_db->param_trees0[p];
            f = val_int(cart_interpret(mcep,mcep_tree));
            item_set_int(mcep,"clustergen_param_frame",f);

            param_track->frames[i][0] = 
                (param_track->frames[i][0]+
                 CG_MODEL_VECTOR(cg_db,model_vectors0,f,0))/2.0;

            for (j=2; j<param_track->num_channels; j++)
                param_track->frames[i][j] =
                    CG_MODEL_VECTOR(cg_db,model_vectors0,f,(j)*fff);

            if (cg_db->mixed_excitation)
            {
                o = j;
                for (j=0; j<5; j++)
                {
                    str_track->frames[i][j] =
                        CG_MODEL_VECTOR(cg_db,model_vectors0,f,(o+(2*j))*fff);
                }
            }
        }

        /* last coefficient is average voicing for cluster */
        item_set_float(mcep,"voicing",
                       CG_MODEL_VECTOR(cg_db,model_vectors0,f,
                                       cg_db->num_channels0-2));

        param_track->times[i] = i * cg_db->frame_advance;
    }

    cg_smooth_F0(utt,cg_db,param_track);

    utt_set_feat(utt,"param_track",track_val(param_track));
    if (cg_db->mixed_excitation)
        utt_set_feat(utt,"str_track",track_val(str_track));

    return utt;
}