/*
 * Post-lexical pass over the "Transcription" relation.
 *
 * Pass 1 walks words head-to-tail and skips any word carrying the "no_pl"
 * or "no_vr" feature.  Segments whose name is listed in unstressed_vowels
 * are renamed to the answer of ru_vowel_reduction_cart unless that answer
 * is "N".  A remaining "ii" segment is renamed to "yy" when the previous
 * segment's ph_csoft feature is "-", except for the content word "и".
 *
 * Pass 2 walks words tail-to-head (segments last-to-first), skipping only
 * "no_pl" words.  A segment for which russian_vpair() returns a partner is
 * renamed to that partner when ru_vpair_cart answers "Y".
 *
 * Returns the utterance it was given.
 */
cst_utterance *russian_postlex_function(cst_utterance *u)
{
    const cst_item *w;
    const cst_item *ph;
    const char *ph_name;
    const char *verdict;
    const char *counterpart;

    /* Pass 1: left to right. */
    w = relation_head(utt_relation(u, "Transcription"));
    while (w != NULL)
    {
        if (!item_feat_present(w, "no_pl") && !item_feat_present(w, "no_vr"))
        {
            for (ph = item_daughter(w); ph != NULL; ph = item_next(ph))
            {
                ph_name = item_feat_string(ph, "name");

                if (cst_member_string(ph_name, unstressed_vowels))
                {
                    verdict = val_string(cart_interpret(item_as(ph, "Segment"),
                                                        &ru_vowel_reduction_cart));
                    if (!cst_streq(verdict, "N"))
                        item_set_string(ph, "name", verdict);
                    continue;
                }

                if (!cst_streq(ph_name, "ii"))
                    continue;
                if (!cst_streq(ffeature_string(ph, "R:Segment.p.ph_csoft"), "-"))
                    continue;
                /* The content word "и" keeps its "ii". */
                if (cst_streq(item_feat_string(w, "name"), "и") &&
                    cst_streq(ffeature_string(w, "gpos"), "content"))
                    continue;

                item_set_string(ph, "name", "yy");
            }
        }
        w = item_next(w);
    }

    /* Pass 2: right to left. */
    w = relation_tail(utt_relation(u, "Transcription"));
    while (w != NULL)
    {
        if (!item_feat_present(w, "no_pl"))
        {
            for (ph = item_last_daughter(w); ph != NULL; ph = item_prev(ph))
            {
                ph_name = item_feat_string(ph, "name");
                counterpart = russian_vpair(ph_name);
                if (counterpart == NULL)
                    continue;

                verdict = val_string(cart_interpret(item_as(ph, "Segment"),
                                                    &ru_vpair_cart));
                if (cst_streq(verdict, "Y"))
                    item_set_string(ph, "name", counterpart);
            }
        }
        w = item_prev(w);
    }

    return u;
}
/*
 * Feature function: is this the last segment of its syllable?
 *
 * Yields VAL_STRING_1 when seg has no "SylStructure" view or nothing
 * follows it there, and VAL_STRING_0 otherwise.
 */
static const cst_val *syl_final(const cst_item *seg)
{
    const cst_item *in_syl = item_as(seg, "SylStructure");

    if (in_syl != NULL && item_next(in_syl) != NULL)
        return VAL_STRING_0;

    return VAL_STRING_1;
}
/*
 * Feature function: count the daughters of word in the "SylStructure"
 * relation and return the count via val_int_n().
 */
static const cst_val *word_numsyls(const cst_item *word)
{
    cst_item *syl = item_daughter(item_as(word, "SylStructure"));
    int count = 0;

    while (syl)
    {
        count++;
        syl = item_next(syl);
    }

    return val_int_n(count);
}
/*
 * Feature function: zero-based position of seg among its siblings in
 * "SylStructure", obtained by counting predecessors.  The counter starts at
 * -1, so a segment with no "SylStructure" view yields val_string_n(-1).
 */
static const cst_val *pos_in_syl(const cst_item *seg)
{
    const cst_item *cur = item_as(seg, "SylStructure");
    int pos = -1;

    while (cur)
    {
        cur = item_prev(cur);
        pos++;
    }

    return val_string_n(pos);
}
static void apostrophe_s(cst_utterance *u) { cst_item *s; cst_item *schwa; const cst_phoneset *ps = u->vox->phoneset; const char *pname, *word; for (s=item_next(UTT_REL_HEAD(u,SEGMENT)); s; s=item_next(s)) { word = val_string(ffeature(s, "R:"SYLSTRUCTURE".P.P.name")); if (cst_streq("'s", word)) { pname = item_feat_string(item_prev(s),"name"); if ((strchr("fa",*phone_feature_string(ps,pname,"ctype")) != NULL) && (strchr("dbg", *phone_feature_string(ps,pname,"cplace")) == NULL)) /* needs a schwa */ { schwa = item_prepend(s,NULL); item_set_string(schwa,"name","ax"); item_prepend(item_as(s,SYLSTRUCTURE),schwa); } else if (cst_streq("-",phone_feature_string(ps,pname,"cvox"))) item_set_string(s,"name","s"); } else if (cst_streq("'ve", word) || cst_streq("'ll", word) || cst_streq("'d", word)) { if (cst_streq("-",ffeature_string(s,"p."PH_VC))) { schwa = item_prepend(s,NULL); item_set_string(schwa,"name","ax"); item_prepend(item_as(s,SYLSTRUCTURE),schwa); } } } }
static const cst_val *seg_onsetcoda(const cst_item *seg) { const cst_item *s; const cst_phoneset *ps = item_phoneset(seg); for (s=item_next(item_as(seg,"SylStructure")); s; s=item_next(s)) { if (cst_streq("+",phone_feature_string(ps,item_feat_string(s,"name"), "vc"))) return (cst_val *)&val_string_onset; } return (cst_val *)&val_string_coda; }
/*
 * Feature function: duration of seg derived from "end" features in the
 * "Segment" relation.
 *
 *   - no "Segment" view: VAL_STRING_0
 *   - first segment:     its own "end" feature value
 *   - otherwise:         a new float val holding end(seg) - end(previous)
 */
static const cst_val *segment_duration(const cst_item *seg)
{
    const cst_item *cur = item_as(seg, "Segment");
    const cst_item *before;

    if (cur == NULL)
        return VAL_STRING_0;

    before = item_prev(cur);
    if (before == NULL)
        return item_feat(cur, "end");

    /* Original author's note: constructing a fresh val here should be fine
       because it is released when the CART interpreter frees its feature
       cache. */
    return float_val(item_feat_float(cur, "end") -
                     item_feat_float(before, "end"));
}
static const cst_val *last_accent(const cst_item *syl) { const cst_item *s; int c; for (c=0,s=item_as(syl,"Syllable"); s && (c < CST_CONST_INT_MAX); s=item_prev(s),c++) { if (val_int(accented(s))) return val_string_n(c); } return val_string_n(c); }
static const cst_val *syl_onsetsize(const cst_item *syl) { cst_item *d; int c; for (c=0,d=item_daughter(item_as(syl,"SylStructure")); d; d=item_next(d),c++) { if (cst_streq("+",val_string(ph_vc(d)))) break; } return val_string_n(c); }
/*
 * Feature function: break level after this syllable.
 *
 *   - no "SylStructure" view:        VAL_STRING_1
 *   - another syllable follows:      VAL_STRING_0 (word-internal)
 *   - last syllable without parent:  VAL_STRING_1
 *   - last syllable of a word:       word_break() of the parent word
 */
static const cst_val *syl_break(const cst_item *syl)
{
    cst_item *in_word = item_as(syl, "SylStructure");
    cst_item *owner;

    if (in_word == NULL)
        return VAL_STRING_1;

    if (item_next(in_word) != NULL)
        return VAL_STRING_0;

    owner = item_parent(in_word);
    if (owner == NULL)
        return VAL_STRING_1;

    return word_break(owner);
}
static const cst_val *syl_in(const cst_item *syl) { /* Number of syllables since last major break */ const cst_item *ss,*p,*fs; int c; ss = item_as(syl,"Syllable"); fs = path_to_item(syl,"R:SylStructure.parent.R:Phrase.parent.daughter.R:SylStructure.daughter"); for (c=0, p=ss; p && (c < CST_CONST_INT_MAX); p=item_prev(p),c++) if (item_equal(p,fs)) break; return val_string_n(c); }
static cst_val* word_to_phones(const cst_item *word) { cst_val*phones=NULL; const char *name=item_feat_string(word, "name"); ustring32_t letters=ustring32_alloc(0); if(letters==NULL) return NULL; ustring32_assign8(letters,(const uint8_t*)name); if(ustring32_empty(letters)) { ustring32_free(letters); return NULL; } unsigned int flags=classify_characters(ustring32_str(letters),ustring32_length(letters)); int variant=item_feat_int(item_parent(item_as(word,"Token")),"variant"); if((flags&cs_lc)&&cst_streq(ffeature_string(word,"gpos"),"content")) { if(variant==variant_pseudo_english) phones=ustring32_lts_apply(letters,&en_consonants_lts); else phones=ustring32_lts_apply(letters,&ru_consonants_lts); item_set_int(word,"no_vr",1); } else if((variant==variant_pseudo_english)&&(flags&cs_en)) { cst_val *en_phones=lex_lookup(en_lex,name,(cst_streq(name,"a")?"n":NULL)); if(en_phones) { phones=ru_lts_apply(en_phones,&ru_en_lts); delete_val(en_phones); } item_set_int(word,"no_pl",1); } else { const ru_dict_entry *e=bsearch(name,ru_dict,ru_dict_size,sizeof(ru_dict_entry),compare_entries); if(e!=NULL) { if(e->stress > 0) ustring32_set(letters,e->stress-1,1105); else item_set_int(word,"stressed_syl_num",e->stress); } phones=ustring32_lts_apply(letters,&ru_lts); } ustring32_free(letters); return phones; }
static const cst_val *position_type(const cst_item *syl) { const cst_item *s = item_as(syl,"SylStructure"); if (s == 0) return (cst_val *)&val_string_single; else if (item_next(s) == 0) { if (item_prev(s) == 0) return (cst_val *)&val_string_single; else return (cst_val *)&val_string_final; } else if (item_prev(s) == 0) return (cst_val *)&val_string_initial; else return (cst_val *)&val_string_mid; }
/*
 * Feature function: punctuation attached to word.
 *
 * A word that is followed by another word inside the same token yields
 * val_string_empty.  Otherwise (last word of the token, or no "Token" view)
 * the "punc" feature of the token parent is looked up with ffeature().
 */
static const cst_val *word_punc(const cst_item *word)
{
    cst_item *in_token = item_as(word, "Token");

    if (in_token != NULL && item_next(in_token) != NULL)
        return &val_string_empty;

    return ffeature(item_parent(in_token), "punc");
}
static const cst_val *seg_onset_ctype(const cst_item *seg, const char *ctype) { const cst_item *s; const cst_phoneset *ps = item_phoneset(seg); for (s=item_daughter(item_parent(item_as(seg,"SylStructure"))); s; s=item_next(s)) { if (cst_streq("+",phone_feature_string(ps,item_feat_string(s,"name"), "vc"))) return VAL_STRING_0; if (cst_streq(ctype,phone_feature_string(ps,item_feat_string(s,"name"), "ctype"))) return VAL_STRING_1; } return VAL_STRING_0; }
static const cst_val *word_break(const cst_item *word) { cst_item *ww,*pp; const char *pname; ww = item_as(word,"Phrase"); if ((ww == NULL) || (item_next(ww) != 0)) return VAL_STRING_1; else { pp = item_parent(ww); pname = val_string(item_feat(pp,"name")); if (cst_streq("BB",pname)) return VAL_STRING_4; else if (cst_streq("B",pname)) return VAL_STRING_3; else return VAL_STRING_1; } }
static const cst_val *ssyl_in(const cst_item *syl) { /* Number of stressed syllables since last major break */ const cst_item *ss,*p,*fs; int c; ss = item_as(syl,"Syllable"); fs = path_to_item(syl,"R:SylStructure.parent.R:Phrase.parent.daughter.R:SylStructure.daughter"); /* This should actually include the first syllable, but Festival's doesn't. */ for (c=0, p=item_prev(ss); p && (!item_equal(p,fs)) && (c < CST_CONST_INT_MAX); p=item_prev(p)) { if (cst_streq("1",item_feat_string(p,"stress"))) c++; } return val_string_n(c); /* its used randomly as int and float */ }
static const cst_val *ssyl_out(const cst_item *syl) { /* Number of stressed syllables until last major break */ const cst_item *ss,*p,*fs; int c; ss = item_as(syl,"Syllable"); fs = path_to_item(syl,"R:SylStructure.parent.R:Phrase.parent.daughtern.R:SylStructure.daughtern"); for (c=0, p=item_next(ss); p && (c < CST_CONST_INT_MAX); p=item_next(p)) { if (cst_streq("1",item_feat_string(p,"stress"))) c++; if (item_equal(p,fs)) break; } return val_string_n(c); /* its used randomly as int and float */ }
/*
 * Build the "Syllable", "SylStructure" and "Segment" relations from the
 * words of the utterance.
 *
 * For each item of the "Word" relation the phone list is taken from, in
 * order of preference:
 *   1. the "phones" feature of the word's token (parent in "Token");
 *   2. the lexicon addenda entry for the word's name (everything after the
 *      first two elements of the entry);
 *   3. lex_lookup() on the utterance's "lexicon" with the word's "pos".
 * Only a list obtained by route 3 is owned here and released after use.
 *
 * Each phone becomes a new Segment, attached to the current syllable in
 * SylStructure.  A trailing '1' or '0' on a phone name is a stress marker:
 * it is stripped from the segment name and remembered; when the lexicon's
 * syl_boundary callback reports a boundary, the most recent marker of that
 * syllable (default "0") is stored in the syllable's "stress" feature.
 *
 * Returns the utterance it was given.
 */
cst_utterance *default_lexical_insertion(cst_utterance *u)
{
    cst_item *word;
    cst_relation *sylstructure, *seg, *syl;
    cst_lexicon *lex;
    const cst_val *lex_addenda = NULL;
    const cst_val *p, *wp;
    char *phone_name;
    char *marker;
    const char *stress = "0";
    const char *pos;
    cst_val *phones;
    cst_item *token, *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
    int owns_phones;

    lex = val_lexicon(feat_val(u->features, "lexicon"));
    if (lex->lex_addenda)
        lex_addenda = lex->lex_addenda;

    syl = utt_relation_create(u, "Syllable");
    sylstructure = utt_relation_create(u, "SylStructure");
    seg = utt_relation_create(u, "Segment");

    for (word = relation_head(utt_relation(u, "Word"));
         word;
         word = item_next(word))
    {
        ssword = relation_append(sylstructure, word);
        pos = ffeature_string(word, "pos");
        token = item_parent(item_as(word, "Token"));
        owns_phones = 0;

        /* FIXME: need to make sure that textanalysis won't split tokens
           with explicit pronunciation (or that it will propagate such to
           words, then we can remove the path here) */
        if (item_feat_present(token, "phones"))
            phones = (cst_val *)item_feat(token, "phones");
        else
        {
            wp = val_assoc_string(item_feat_string(word, "name"), lex_addenda);
            if (wp)
                phones = (cst_val *)val_cdr(val_cdr(wp));
            else
            {
                phones = lex_lookup(lex, item_feat_string(word, "name"), pos);
                owns_phones = 1;    /* only looked-up lists are ours to free */
            }
        }

        for (sssyl = NULL, sylitem = NULL, p = phones; p; p = val_cdr(p))
        {
            if (sylitem == NULL)
            {
                /* Start a new syllable; stress defaults to unstressed. */
                sylitem = relation_append(syl, NULL);
                sssyl = item_add_daughter(ssword, sylitem);
                stress = "0";
            }
            segitem = relation_append(seg, NULL);
            phone_name = cst_strdup(val_string(val_car(p)));

            /* Strip a trailing stress digit.  The length is measured once,
               and an empty name is left alone instead of indexing one byte
               before the buffer. */
            marker = phone_name + cst_strlen(phone_name);
            if (marker != phone_name)
            {
                marker--;
                if (*marker == '1')
                {
                    stress = "1";
                    *marker = '\0';
                }
                else if (*marker == '0')
                {
                    stress = "0";
                    *marker = '\0';
                }
            }

            item_set_string(segitem, "name", phone_name);
            seg_in_syl = item_add_daughter(sssyl, segitem);

            if ((lex->syl_boundary)(seg_in_syl, val_cdr(p)))
            {
                sylitem = NULL;
                if (sssyl)
                    item_set_string(sssyl, "stress", stress);
            }
            cst_free(phone_name);
        }

        if (owns_phones)
            delete_val(phones);
    }

    return u;
}
/*
 * Russian lexical insertion: create the "Syllable", "SylStructure",
 * "Segment", "SylVowel" and "Transcription" relations from the "Word"
 * relation.
 *
 * For every word, word_to_phones() supplies the phone list; words with no
 * phones are skipped.  Processing stops at the first word that would push
 * the running segment total past max_num_segs.  Each phone becomes a
 * Segment that is attached both to the current syllable (SylStructure) and
 * to the word (Transcription); vowels additionally get an entry in
 * SylVowel.  ru_syl_boundary() decides where syllables end, and each closed
 * syllable starts with stress "0" before assign_stress() runs on the word.
 *
 * Afterwards every word that did not make it into "Transcription" is
 * deleted together with its "Token" and "Phrase" views, and phrases left
 * without daughters are deleted too.
 *
 * Returns the utterance it was given.
 */
cst_utterance *russian_lexical_insertion(cst_utterance *u)
{
    cst_relation *rel_syl, *rel_sylstruct, *rel_seg, *rel_sylvowel, *rel_trans;
    cst_item *w, *following;
    cst_item *word_syls, *word_trans;
    cst_item *cur_syl, *syl_in_word, *new_seg, *seg_in_syl, *vowel_syl;
    const cst_val *ph;
    const char *ph_name;
    cst_val *phones;
    int seg_count;
    int segs_so_far = 0;

    rel_syl = utt_relation_create(u, "Syllable");
    rel_sylstruct = utt_relation_create(u, "SylStructure");
    rel_seg = utt_relation_create(u, "Segment");
    rel_sylvowel = utt_relation_create(u, "SylVowel");
    rel_trans = utt_relation_create(u, "Transcription");

    for (w = relation_head(utt_relation(u, "Word")); w; w = item_next(w))
    {
        phones = word_to_phones(w);
        if (phones == NULL)
            continue;

        seg_count = val_length(phones);
        if (segs_so_far + seg_count > max_num_segs)
        {
            /* Segment budget exhausted: drop this and all later words. */
            delete_val(phones);
            break;
        }

        word_syls = relation_append(rel_sylstruct, w);
        word_trans = relation_append(rel_trans, w);
        cur_syl = NULL;
        syl_in_word = NULL;

        for (ph = phones; ph; ph = val_cdr(ph))
        {
            if (cur_syl == NULL)
            {
                cur_syl = relation_append(rel_syl, NULL);
                syl_in_word = item_add_daughter(word_syls, cur_syl);
            }

            new_seg = relation_append(rel_seg, NULL);
            ph_name = val_string(val_car(ph));
            item_set_string(new_seg, "name", ph_name);
            seg_in_syl = item_add_daughter(syl_in_word, new_seg);
            item_add_daughter(word_trans, new_seg);

            if (is_vowel(ph_name))
            {
                vowel_syl = relation_append(rel_sylvowel, cur_syl);
                item_add_daughter(vowel_syl, new_seg);
            }

            if (ru_syl_boundary(seg_in_syl, val_cdr(ph)))
            {
                cur_syl = NULL;
                if (syl_in_word)
                    item_set_string(syl_in_word, "stress", "0");
            }
        }

        assign_stress(w);
        delete_val(phones);
        segs_so_far += seg_count;
    }

    /* Remove words that received no transcription.  The successor is saved
       first because the current item may be deleted. */
    for (w = relation_head(utt_relation(u, "Word")); w; w = following)
    {
        following = item_next(w);
        if (item_as(w, "Transcription") != NULL)
            continue;
        delete_item(item_as(w, "Token"));
        delete_item(item_as(w, "Phrase"));
        delete_item(w);
    }

    /* Remove phrases that ended up empty. */
    for (w = relation_head(utt_relation(u, "Phrase")); w; w = following)
    {
        following = item_next(w);
        if (item_daughter(w) == NULL)
            delete_item(w);
    }

    return u;
}
/*
 * Build the "Syllable", "SylStructure" and "Segment" relations from the
 * words of the utterance.
 *
 * For each item of the "Word" relation the phone list is taken from, in
 * order of preference:
 *   1. the "phones" feature of the word's token (parent in "Token");
 *   2. lex_lookup() on the utterance's "user_lexicon", when that feature is
 *      present and the lookup succeeds;
 *   3. lex_lookup() on the utterance's "lexicon".
 * A list obtained by lookup (routes 2 and 3) is released after use; a list
 * borrowed from the token is left alone.
 *
 * Each phone becomes a new Segment, attached to the current syllable in
 * SylStructure.  A trailing '1' or '0' on a phone name is a stress marker:
 * it is stripped from the segment name and remembered; when the lexicon's
 * syl_boundary callback reports a boundary, the most recent marker of that
 * syllable (default "0") is stored in the syllable's "stress" feature.
 *
 * Returns the utterance it was given.
 */
cst_utterance *default_lexical_insertion(cst_utterance *u)
{
    cst_item *word;
    cst_relation *sylstructure, *seg, *syl;
    cst_lexicon *lex, *ulex = NULL;
    const cst_val *p;
    char *phone_name;
    char *marker;
    const char *stress = "0";
    cst_val *phones;
    cst_item *token, *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
    int borrowed;

    lex = val_lexicon(feat_val(u->features, "lexicon"));
    if (feat_present(u->features, "user_lexicon"))
        ulex = val_lexicon(feat_val(u->features, "user_lexicon"));

    syl = utt_relation_create(u, "Syllable");
    sylstructure = utt_relation_create(u, "SylStructure");
    seg = utt_relation_create(u, "Segment");

    for (word = relation_head(utt_relation(u, "Word"));
         word;
         word = item_next(word))
    {
        ssword = relation_append(sylstructure, word);
        phones = NULL;
        token = item_parent(item_as(word, "Token"));
        borrowed = item_feat_present(token, "phones");

        /* FIXME: need to make sure that textanalysis won't split tokens
           with explicit pronunciation (or that it will propagate such to
           words, then we can remove the path here) */
        if (borrowed)
            phones = (cst_val *)item_feat(token, "phones");
        else
        {
            if (ulex)
                phones = lex_lookup(ulex, item_feat_string(word, "name"), 0);
            if (phones == NULL)
                phones = lex_lookup(lex, item_feat_string(word, "name"), 0);
        }

        for (sssyl = NULL, sylitem = NULL, p = phones; p; p = val_cdr(p))
        {
            if (sylitem == NULL)
            {
                /* Start a new syllable; stress defaults to unstressed. */
                sylitem = relation_append(syl, NULL);
                sssyl = item_add_daughter(ssword, sylitem);
                stress = "0";
            }
            segitem = relation_append(seg, NULL);
            phone_name = cst_strdup(val_string(val_car(p)));

            /* Strip a trailing stress digit.  The length is measured once,
               and an empty name is left alone instead of indexing one byte
               before the buffer. */
            marker = phone_name + strlen(phone_name);
            if (marker != phone_name)
            {
                marker--;
                if (*marker == '1')
                {
                    stress = "1";
                    *marker = '\0';
                }
                else if (*marker == '0')
                {
                    stress = "0";
                    *marker = '\0';
                }
            }

            item_set_string(segitem, "name", phone_name);
            seg_in_syl = item_add_daughter(sssyl, segitem);

            if ((lex->syl_boundary)(seg_in_syl, val_cdr(p)))
            {
                sylitem = NULL;
                if (sssyl)
                    item_set_string(sssyl, "stress", stress);
            }
            cst_free(phone_name);
        }

        /* Lists that came from a lexicon lookup are ours to free. */
        if (!borrowed)
            delete_val(phones);
    }

    return u;
}
/* True when s is "не" or "ни". */
static int assign_stress_is_ne_ni(const char *s)
{
    return cst_streq(s,"не") || cst_streq(s,"ни");
}

/* True when s is "был", "были" or "было". */
static int assign_stress_is_byl(const char *s)
{
    return cst_streq(s,"был") || cst_streq(s,"были") || cst_streq(s,"было");
}

/*
 * Set the "stress" feature to "1" on the appropriate syllable(s) of word.
 *
 * Special cases handled first, in this order:
 *   - "моему"/"своему"/"твоему" whose previous token is "по": the first
 *     syllable is stressed;
 *   - "не"/"ни" followed in the phrase by "был"/"были"/"было": the first
 *     syllable is stressed;
 *   - "был"/"были"/"было" preceded in the phrase by "не"/"ни": no stress;
 *   - gpos "enc" with a preceding word in the phrase, or gpos "proc" with a
 *     following word in the phrase: no stress (presumably enclitics and
 *     proclitics - confirm).
 *
 * Otherwise a word without any vowel segment (per vowel_seg_between) gets
 * no stress and a one-syllable word is stressed.  For longer words every
 * syllable whose SylVowel segment satisfies is_stressed_vowel() is marked.
 * If the word carries no non-zero "stressed_syl_num" and nothing was marked
 * yet, ru_stress_cart supplies an offset n; when numsyls + n would be
 * negative, n falls back to -2, -3 or -4 depending on the syllable count.
 * Finally syllable number (numsyls + n) is marked as stressed.
 */
static void assign_stress(cst_item *word)
{
    int numsyls = ffeature_int(word, "word_numsyls");
    const char *gpos = ffeature_string(word, "gpos");
    const cst_item *in_phrase = item_as(word, "Phrase");
    const cst_item *syl_root = item_as(word, "SylStructure");
    const cst_item *first_syl = item_daughter(syl_root);
    const cst_item *trans = item_as(word, "Transcription");
    int marked = FALSE;
    int offset = item_feat_present(word, "stressed_syl_num")
                     ? item_feat_int(word, "stressed_syl_num")
                     : 0;
    const char *self = item_name(word);
    const char *before = ffeature_string(word, "R:Phrase.p.name");
    const char *after = ffeature_string(word, "R:Phrase.n.name");
    const cst_item *cur;

    if (cst_streq(ffeature_string(word, "R:Token.p.name"),"по") &&
        (cst_streq(self,"моему") || cst_streq(self,"своему") || cst_streq(self,"твоему")))
    {
        item_set_string(first_syl, "stress", "1");
        return;
    }
    if (assign_stress_is_ne_ni(self) && assign_stress_is_byl(after))
    {
        item_set_string(first_syl, "stress", "1");
        return;
    }
    if (assign_stress_is_byl(self) && assign_stress_is_ne_ni(before))
        return;
    if (cst_streq(gpos, "enc") && item_prev(in_phrase))
        return;
    if (cst_streq(gpos, "proc") && item_next(in_phrase))
        return;

    /* Nothing to stress in a word without a vowel. */
    if (!vowel_seg_between(item_daughter(trans), item_last_daughter(trans)))
        return;

    if (numsyls == 1)
    {
        item_set_string(first_syl, "stress", "1");
        return;
    }

    /* Honour vowels that are inherently stressed. */
    for (cur = first_syl; cur; cur = item_next(cur))
    {
        const char *vowel =
            item_feat_string(item_daughter(item_as(cur, "SylVowel")), "name");

        if (is_stressed_vowel(vowel))
        {
            item_set_string(cur, "stress", "1");
            marked = TRUE;
        }
    }

    if (offset == 0)
    {
        if (marked)
            return;

        offset = val_int(cart_interpret(word, &ru_stress_cart));
        if ((numsyls + offset) < 0)
        {
            /* Predicted offset reaches before the first syllable. */
            if (numsyls <= 4)
                offset = -2;
            else if (numsyls <= 6)
                offset = -3;
            else
                offset = -4;
        }
    }

    item_set_string(item_nth_daughter(syl_root, (numsyls + offset)), "stress", "1");
}