/*
 * Expand a string into a list of single-symbol words, one per input
 * character, in input order:
 *   - upper-case letters are folded to lower case,
 *   - decimal digits are replaced by their digit2num[] word,
 *   - the letter "a" is emitted as "_a",
 *   - every other character is emitted as a one-character string.
 *
 * lets must be a NUL-terminated string.  The returned list is newly built.
 */
cst_val *en_exp_letters(const char *lets)
{
    char sym[2];        /* one-character scratch string handed to string_val */
    cst_val *r = 0;
    int i;

    sym[1] = '\0';
    for (i = 0; lets[i] != '\0'; i++)
    {
        /* <ctype.h> functions require a value representable as unsigned
           char; passing a negative plain char is undefined behaviour. */
        unsigned char c = (unsigned char)lets[i];

        if (isupper(c))
            c = (unsigned char)tolower(c);
        sym[0] = (char)c;

        if ((c >= '0') && (c <= '9'))
            r = cons_val(string_val(digit2num[c - '0']), r);
        else if (cst_streq(sym, "a"))
            r = cons_val(string_val("_a"), r);
        else
            r = cons_val(string_val(sym), r);
    }

    /* The list was built back to front. */
    return val_reverse(r);
}
/*
 * Expand a real-number token into a list of words.
 *
 *   leading '-'      -> "minus" followed by the expansion of the rest
 *   leading '+'      -> "plus"  followed by the expansion of the rest
 *   contains e / E   -> <mantissa as real> "e" <exponent as real>
 *   contains '.'     -> <integer part via en_exp_number> "point"
 *                       <fraction via en_exp_digits>
 *   otherwise        -> en_exp_number(numstring)
 *
 * A NULL numstring yields NULL (the empty list); previously the sign
 * checks guarded against NULL but the following strchr() calls did not.
 */
cst_val *en_exp_real(const char *numstring)
{
    char *head;         /* private copy of the text before the separator */
    const char *p;      /* position of the separator inside numstring */
    cst_val *r;

    if (numstring == NULL)
        return NULL;

    if (numstring[0] == '-')
        return cons_val(string_val("minus"), en_exp_real(&numstring[1]));
    if (numstring[0] == '+')
        return cons_val(string_val("plus"), en_exp_real(&numstring[1]));

    /* A lower-case 'e' anywhere takes precedence over an upper-case 'E'. */
    if (((p = strchr(numstring, 'e')) != 0) ||
        ((p = strchr(numstring, 'E')) != 0))
    {
        head = cst_strdup(numstring);
        head[p - numstring] = '\0';
        r = val_append(en_exp_real(head),
                       cons_val(string_val("e"), en_exp_real(p + 1)));
        cst_free(head);
        return r;
    }

    if ((p = strchr(numstring, '.')) != 0)
    {
        head = cst_strdup(numstring);
        head[p - numstring] = '\0';
        r = val_append(en_exp_number(head),
                       cons_val(string_val("point"), en_exp_digits(p + 1)));
        cst_free(head);
        return r;
    }

    return en_exp_number(numstring);
}
/*
 * Return the list of words that token/name expands to.
 *
 * If the token carries a "phones" feature the name is returned as a single
 * word unconditionally.  Otherwise a one-word list is returned for a
 * non-empty name and NULL for an empty one.
 */
static cst_val *cmu_LANGNAME_tokentowords(cst_item *token, const char *name)
{
    if (item_feat_present(token, "phones"))
        return cons_val(string_val(name), NULL);

#if 0
    if (item_feat_present(token,"nsw"))
        nsw = item_feat_string(token,"nsw");

    utt = item_utt(token);
    lex = val_lexicon(feat_val(utt->features,"lexicon"));
#endif

    if (cst_strlen(name) > 0)
        return cons_val(string_val(name), 0);

    return NULL;
}
cst_val *lex_lookup(const cst_lexicon *l, const char *word, const char *pos, const cst_features *feats) { int index; int p; const char *q; char *wp; cst_val *phones = 0; int found = FALSE; wp = cst_alloc(char,cst_strlen(word)+2); cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word); if (l->addenda) phones = lex_lookup_addenda(wp,l,&found); if (!found) { index = lex_lookup_bsearch(l,wp); if (index >= 0) { if (l->phone_hufftable) { for (p=index-2; l->data[p]; p--) for (q=l->phone_hufftable[l->data[p]]; *q; q++) phones = cons_val(string_val(l->phone_table[(unsigned char)*q]), phones); } else /* no compression -- should we still support this ? */ { for (p=index-2; l->data[p]; p--) phones = cons_val(string_val(l->phone_table[l->data[p]]), phones); } phones = val_reverse(phones); } else if (l->lts_function) { phones = (l->lts_function)(l,word,"",feats); } else if (l->lts_rule_set) { phones = lts_apply(word, "", /* more features if we had them */ l->lts_rule_set); } } cst_free(wp); return phones; }
/*
 * Build a new list consisting of "#", the elements of l in order, and a
 * closing "#".  The input list itself is not modified.
 */
static cst_val *add_lts_boundary_marks(const cst_val *l)
{
    const cst_val *cell = l;
    cst_val *marked = cons_val(string_val("#"), NULL);

    /* Accumulate in reverse; a single val_reverse fixes the order. */
    while (cell)
    {
        marked = cons_val(val_car(cell), marked);
        cell = val_cdr(cell);
    }
    marked = cons_val(string_val("#"), marked);

    return val_reverse(marked);
}
cst_val *en_exp_ordinal(const char *rawnumstring) { /* return ordinal for digit string */ cst_val *card, *o; const cst_val *t; const char *l; const char *ord; char *numstring; int i,j; numstring = cst_strdup(rawnumstring); for (j=i=0; i < cst_strlen(rawnumstring); i++) if (rawnumstring[i] != ',') { numstring[j] = rawnumstring[i]; j++; } numstring[j] = '\0'; card = val_reverse(en_exp_number(numstring)); cst_free(numstring); l = val_string(val_car(card)); ord = 0; for (i=0; i<10; i++) if (cst_streq(l,digit2num[i])) ord = ord2num[i]; if (!ord) for (i=0; i<10; i++) if (cst_streq(l,digit2teen[i])) ord = ord2teen[i]; if (!ord) for (i=0; i<10; i++) if (cst_streq(l,digit2enty[i])) ord = ord2enty[i]; if (cst_streq(l,"hundred")) ord = "hundredth"; if (cst_streq(l,"thousand")) ord = "thousandth"; if (cst_streq(l,"billion")) ord = "billtionth"; if (!ord) /* dunno, so don't convert anything */ return card; o = cons_val(string_val(ord),0); for (t=val_cdr(card); t; t=val_cdr(t)) o = cons_val(val_car(t),o); delete_val(card); return o; }
/*
 * Read every token of wavelistfile into a list, in file order.
 *
 * Returns 0 if the file cannot be opened.  If the number of tokens is odd
 * a diagnostic is printed, the collected list is discarded and the result
 * of reversing the empty list is returned.
 */
cst_val *get_wavelist(const char *wavelistfile)
{
    cst_tokenstream *ts = ts_open(wavelistfile);
    cst_val *names = 0;
    const char *token;
    int count = 0;

    if (!ts)
    {
        fprintf(stderr,"combine_waves: can't open \"%s\"\n",wavelistfile);
        return 0;
    }

    for (token = ts_get(ts); token != 0; token = ts_get(ts))
    {
        names = cons_val(string_val(token), names);
        count++;
    }

    /* Entries are expected to come in pairs. */
    if ((count % 2) != 0)
    {
        fprintf(stderr,"combine_waves: doesn't have matched pairs \"%s\"\n",wavelistfile);
        delete_val(names);
        names = 0;
    }

    ts_close(ts);
    return val_reverse(names);
}
static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l, int *found) { /* For those other words */ int i,j; cst_val *phones; phones = NULL; for (i=0; l->addenda[i]; i++) { if (((wp[0] == '0') || (wp[0] == l->addenda[i][0][0]) || (l->addenda[i][0][0] == '0')) && (cst_streq(wp+1,l->addenda[i][0]+1))) { for (j=1; l->addenda[i][j]; j++) phones = cons_val(string_val(l->addenda[i][j]),phones); *found = TRUE; return val_reverse(phones); } } return NULL; }
/*
 * Expand numstring into a list with one word per character: decimal digits
 * become their digit2num[] word, anything else becomes "umpty".
 */
cst_val *en_exp_digits(const char *numstring)
{
    cst_val *words = 0;
    int i;

    for (i = 0; numstring[i] != '\0'; i++)
    {
        const char c = numstring[i];

        if ((c < '0') || (c > '9'))
            words = cons_val(string_val("umpty"), words);
        else
            words = cons_val(string_val(digit2num[c - '0']), words);
    }

    /* Built back to front. */
    return val_reverse(words);
}
/*
 * Parse the command line argv[1..argc-1] against description.
 *
 * Options (arguments starting with '-') are stored into args under their
 * own name, typed according to the description: <binary> options are set
 * to the string "true"; <int>, <float> and <string> options consume the
 * following argument.  Unknown options and the help flags (-h, -?, --help,
 * -help) invoke parse_usage().
 *
 * Returns the non-option arguments, in command-line order.
 *
 * NOTE(review): after a "missing argument" report the code continues and
 * reads argv[i+1]; presumably parse_usage() does not return -- confirm.
 */
cst_val *cst_args(char **argv, int argc,
                  const char *description,
                  cst_features *args)
{
    cst_features *op_types = new_features();
    cst_val *files = NULL;
    const char *type;
    const char *opt;
    int i;

    parse_description(description,op_types);

    for (i = 1; i < argc; i++)
    {
        opt = argv[i];

        /* Plain argument: collect it as a file name. */
        if (opt[0] != '-')
        {
            files = cons_val(string_val(opt), files);
            continue;
        }

        /* Unknown option or explicit request for help. */
        if ((!feat_present(op_types, opt)) ||
            (cst_streq("-h", opt)) ||
            (cst_streq("-?", opt)) ||
            (cst_streq("--help", opt)) ||
            (cst_streq("-help", opt)))
        {
            parse_usage(argv[0],"","",description);
            continue;
        }

        type = feat_string(op_types, opt);
        if (cst_streq("<binary>", type))
        {
            feat_set_string(args, opt, "true");
            continue;
        }

        /* Every remaining type takes a value from the next argument. */
        if (i + 1 == argc)
            parse_usage(argv[0],
                        "missing argument for ", opt,
                        description);

        if (cst_streq("<int>", type))
            feat_set_int(args, opt, atoi(argv[i + 1]));
        else if (cst_streq("<float>", type))
            feat_set_float(args, opt, atof(argv[i + 1]));
        else if (cst_streq("<string>", type))
            feat_set_string(args, opt, argv[i + 1]);
        else
            parse_usage(argv[0],
                        "unknown arg type ", type,
                        description);
        i++;    /* skip the value just consumed */
    }

    delete_features(op_types);
    return val_reverse(files);
}
/*
 * Apply the rewrite rules in rule to the UTF-32 string u32.
 *
 * Each code point is encoded as a separate UTF-8 string, the sequence is
 * wrapped in "#" boundary marks and handed to lts_rewrites().  Returns
 * NULL for an empty string, otherwise the result of lts_rewrites().
 *
 * Code points that u8_uctomb() refuses to encode are skipped; previously
 * its negative error return was used as an index into the buffer.
 */
cst_val* ustring32_lts_apply(const ustring32_t u32,const cst_lts_rewrites *rule)
{
    size_t n = ustring32_length(u32);
    if (n == 0)
        return NULL;

    /* Build the tape from the end so that no reversal is needed. */
    cst_val *l = cons_val(string_val("#"), NULL);
    uint8_t b[8];
    size_t i = n;
    int k;

    do
    {
        i--;
        /* Keep one byte in reserve for the terminating NUL. */
        k = u8_uctomb(b, ustring32_at(u32, i), sizeof(b) - 1);
        if (k <= 0)
            continue;   /* invalid code point: nothing to add */
        b[k] = '\0';
        l = cons_val(string_val((char*)b), l);
    }
    while (i);

    l = cons_val(string_val("#"), l);

    cst_val *output = lts_rewrites(l, rule);
    delete_val(l);
    return output;
}
cst_val *en_exp_id(const char *numstring) { /* Expand numstring as pairs as in years or ids */ char aaa[3]; if ((strlen(numstring) == 4) && (numstring[2] == '0') && (numstring[3] == '0')) { if (numstring[1] == '0') return en_exp_number(numstring); /* 2000, 3000 */ else { aaa[0] = numstring[0]; aaa[1] = numstring[1]; aaa[2] = '\0'; return val_append(en_exp_number(aaa), cons_val(string_val("hundred"),0)); } } else if ((strlen(numstring) == 2) && (numstring[0] == '0')) return cons_val(string_val("oh"), en_exp_digits(&numstring[1])); else if (((strlen(numstring) == 4) && ((numstring[1] == '0'))) || (strlen(numstring) < 3)) return en_exp_number(numstring); else if (strlen(numstring)%2 == 1) { return cons_val(string_val(digit2num[numstring[0]-'0']), en_exp_id(&numstring[1])); } else { aaa[0] = numstring[0]; aaa[1] = numstring[1]; aaa[2] = '\0'; return val_append(en_exp_number(aaa),en_exp_id(&numstring[2])); } }
/*
 * Apply the rewrite rules r to word.
 *
 * The word is split into one-character strings, wrapped in "#" boundary
 * marks and passed to lts_rewrites(); the temporary letter list is freed
 * before returning the result.
 */
cst_val *lts_rewrites_word(const char *word, const cst_lts_rewrites *r)
{
    char letter[2];
    const char *c;
    cst_val *tape;
    cst_val *result;

    letter[1] = '\0';

    /* Build "# w o r d #" in reverse, then flip it. */
    tape = cons_val(string_val("#"), NULL);
    for (c = word; *c; c++)
    {
        letter[0] = *c;
        tape = cons_val(string_val(letter), tape);
    }
    tape = cons_val(string_val("#"), tape);
    tape = val_reverse(tape);

    result = lts_rewrites(tape, r);
    delete_val(tape);

    return result;
}
/*
 * Rewrite the input tape itape using the rules in r and return the list
 * of output symbols.
 *
 * The first element of itape seeds the left context; the rest is the right
 * context.  While more than one element remains on the right, the matching
 * rule is looked up with find_rewrite_rule(); one input element is moved
 * to the left context for every element of the rule's second field, and
 * the elements of the rule's fourth field are appended to the output.
 * Processing stops early when no rule matches.
 */
cst_val *lts_rewrites(const cst_val *itape, const cst_lts_rewrites *r)
{
    cst_val *left = cons_val(val_car(itape), NULL);  /* consumed, reversed */
    const cst_val *right = val_cdr(itape);           /* still to process */
    cst_val *out = NULL;                             /* output, reversed */
    const cst_val *rule;
    const cst_val *step;

    for (;;)
    {
        if (!val_cdr(right))
            break;

        rule = find_rewrite_rule(left, right, r);
        if (!rule)
            break;

        /* Shift the tape head past the matched input. */
        step = val_car(val_cdr(rule));
        while (step)
        {
            left = cons_val(val_car(right), left);
            right = val_cdr(right);
            step = val_cdr(step);
        }

        /* Emit the rule's output symbols. */
        step = val_car(val_cdr(val_cdr(val_cdr(rule))));
        while (step)
        {
            out = cons_val(val_car(step), out);
            step = val_cdr(step);
        }
    }

    delete_val_list(left);

    return val_reverse(out);
}
cst_val *cst_utf8_explode(const cst_string *utf8string) { /* return a list of utf-8 characters as strings */ const unsigned char *xxx = (const unsigned char *)utf8string; cst_val *chars=NULL; int i, l=0; char utf8char[5]; #import "OpenEarsStaticAnalysisToggle.h" #ifdef STATICANALYZEDEPENDENCIES #define __clang_analyzer__ 1 #endif #if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES) #undef __clang_analyzer__ for (i=0; xxx[i]; i++) { if (xxx[i] < 0x80) /* one byte */ { sprintf(utf8char,"%c",xxx[i]); l = 1; } else if (xxx[i] < 0xe0) /* two bytes */ { sprintf(utf8char,"%c%c",xxx[i],xxx[i+1]); i++; l = 2; } else if (xxx[i] < 0xff) /* three bytes */ { sprintf(utf8char,"%c%c%c",xxx[i],xxx[i+1],xxx[i+2]); i++; i++; l = 3; } else { sprintf(utf8char,"%c%c%c%c",xxx[i],xxx[i+1],xxx[i+2],xxx[i+3]); i++; i++; i++; l = 4; } chars = cons_val(string_val(utf8char),chars); } return val_reverse(chars); #endif }
cst_val *cmu_grapheme_lex_lts_function(const struct lexicon_struct *l, const char *word, const char *pos, const cst_features *feats) { cst_val *phones = 0; cst_val *utflets = 0; const cst_val *v; char ord[10]; int i,phindex; /* string to utf8 chars */ utflets = cst_utf8_explode(word); for (v=utflets; v; v=val_cdr(v)) { /* We will add the found phones in reverse order and reverse then */ /* afterwards */ cst_utf8_as_hex(val_string(val_car(v)),ord); phindex = cst_find_u2sampa(ord); if (phindex < 0) printf("awb_debug no sampa %s %s\n",val_string(val_car(v)),ord); for (i=4; (phindex>=0) && (i>0); i--) { if (unicode_sampa_mapping[phindex][i]) phones = cons_val(string_val(unicode_sampa_mapping[phindex][i]), phones); } } phones = val_reverse(phones); #if 1 printf("cmu_grapheme_lex.c: word \"%s\" ",word); val_print(stdout,phones); printf("\n"); #endif delete_val(utflets); return phones; }
/*
 * Return a list of the UTF-8 characters of utf8string, each as its own
 * string, in input order.
 *
 * The sequence length is taken from the lead byte: below 0x80 one byte,
 * below 0xe0 two, below 0xf0 three, otherwise four.  (The three-byte test
 * used to be "< 0xff", which split four-byte sequences incorrectly.)
 * A sequence cut short by the end of the string is emitted with the bytes
 * that are present, so the scan never runs past the terminating NUL.
 */
cst_val *cst_utf8_explode(const cst_string *utf8string)
{
    const unsigned char *bytes = (const unsigned char *)utf8string;
    cst_val *chars = NULL;
    int i;
    int want;           /* length announced by the lead byte */
    int len;            /* bytes actually copied for this character */
    char utf8char[5];

    for (i = 0; bytes[i] != '\0'; i += len)
    {
        if (bytes[i] < 0x80)        /* one byte */
            want = 1;
        else if (bytes[i] < 0xe0)   /* two bytes */
            want = 2;
        else if (bytes[i] < 0xf0)   /* three bytes */
            want = 3;
        else                        /* four bytes */
            want = 4;

        /* Copy at most `want` bytes, stopping at the terminator;
           len is always >= 1 because bytes[i] is non-zero here. */
        for (len = 0; (len < want) && (bytes[i + len] != '\0'); len++)
            utf8char[len] = (char)bytes[i + len];
        utf8char[len] = '\0';

        chars = cons_val(string_val(utf8char), chars);
    }

    return val_reverse(chars);
}
/*
 * Load addenda entries from lexfile, one entry per line.
 *
 * Lines whose first character is '#' and lines containing only spaces are
 * skipped.  Every other line is passed to cst_lex_make_entry(); entries it
 * rejects (NULL) are dropped.  Returns the entries in file order, or NULL
 * if the file cannot be opened.
 */
cst_val *cst_lex_load_addenda(const cst_lexicon *lex, const char *lexfile)
{
    cst_tokenstream *lf;
    const cst_string *line;
    cst_val *entry = NULL;
    cst_val *entries = NULL;
    int i;

    /* Newline is the only separator, so each token is a whole line. */
    lf = ts_open(lexfile,"\n","","","");
    if (lf == NULL)
    {
        cst_errmsg("lex_add_addenda: cannot open lexicon file\n");
        return NULL;
    }

    while (!ts_eof(lf))
    {
        line = ts_get(lf);

        if (line[0] == '#')
            continue;           /* a comment */

        /* Find the first character that is not a space. */
        i = 0;
        while (line[i] == ' ')
            i++;
        if (!line[i])
            continue;           /* a blank line */

        entry = cst_lex_make_entry(lex, line);
        if (entry)
            entries = cons_val(entry, entries);
    }

    ts_close(lf);
    return val_reverse(entries);
}
/*
 * Default token expansion: a one-element list holding the token's "name"
 * feature.
 */
cst_val *default_tokentowords(cst_item *i)
{
    const char *name = item_feat_string(i, "name");

    return cons_val(string_val(name), NULL);
}
/*
 * Wrap n in a string value, push it onto the front of f->owned_strings and
 * return the string held by that new head element.
 */
const char *feat_own_string(cst_features *f,const char *n)
{
    cst_val *owned = string_val(n);

    f->owned_strings = cons_val(owned, f->owned_strings);

    return val_string(val_car(f->owned_strings));
}
cst_val *lts_apply(const char *word,const char *feats,const cst_lts_rules *r) { int pos; cst_val *phones=0; cst_lts_letter *fval_buff; cst_lts_letter *full_buff; cst_lts_phone phone; char *left, *right, *p; /* For feature vals for each letter */ fval_buff = cst_alloc(cst_lts_letter, (r->context_window_size*2)+ r->context_extra_feats); /* Buffer with added contexts */ full_buff = cst_alloc(cst_lts_letter, (r->context_window_size*2)+ strlen(word)+1); /* TBD assumes single POS feat */ /* Assumes l_letter is a char and context < 8 */ sprintf(full_buff,"%.*s#%s#%.*s", r->context_window_size-1, "00000000", word, r->context_window_size-1, "00000000"); /* Do the prediction backwards so we don't need to reverse the answer */ for (pos = r->context_window_size + strlen(word) - 1; full_buff[pos] != '#'; pos--) { /* Fill the features buffer for the predictor */ sprintf(fval_buff,"%.*s%.*s%s", r->context_window_size, full_buff+pos-r->context_window_size, r->context_window_size, full_buff+pos+1, feats); if ((full_buff[pos] < 'a') || (full_buff[pos] > 'z')) { /* English specific */ #ifdef EXCESSIVELY_CHATTY cst_errmsg("lts:skipping unknown char \"%c\"\n", full_buff[pos]); #endif continue; } phone = apply_model(fval_buff, r->letter_index[((full_buff[pos])-'a')%26], r->models); /* delete epsilons and split dual-phones */ if (cst_streq("epsilon",r->phone_table[phone])) continue; else if ((p=strchr(r->phone_table[phone],'-')) != NULL) { left = cst_substr(r->phone_table[phone],0, strlen(r->phone_table[phone])-strlen(p)); right = cst_substr(r->phone_table[phone], (strlen(r->phone_table[phone])-strlen(p))+1, (strlen(p)-1)); phones = cons_val(string_val(left), cons_val(string_val(right),phones)); cst_free(left); cst_free(right); } else phones = cons_val(string_val(r->phone_table[phone]),phones); } cst_free(full_buff); cst_free(fval_buff); return phones; }
cst_val *en_exp_number(const char *numstring) { /* Expand given token to list of words pronouncing it as a number */ int num_digits = cst_strlen(numstring); char part[4]; cst_val *p; int i; if (num_digits == 0) return NULL; else if (num_digits == 1) return en_exp_digits(numstring); else if (num_digits == 2) { if (numstring[0] == '0') { if (numstring[1] == '0') return 0; else return cons_val(string_val(digit2num[numstring[1]-'0']),0); } else if (numstring[1] == '0') return cons_val(string_val(digit2enty[numstring[0]-'0']),0); else if (numstring[0] == '1') return cons_val(string_val(digit2teen[numstring[1]-'0']),0); else return cons_val(string_val(digit2enty[numstring[0]-'0']), en_exp_digits(numstring+1)); } else if (num_digits == 3) { if (numstring[0] == '0') return en_exp_number(numstring+1); else return cons_val(string_val(digit2num[numstring[0]-'0']), cons_val(string_val("hundred"), en_exp_number(numstring+1))); } else if (num_digits < 7) { for (i=0; i < num_digits-3; i++) part[i] = numstring[i]; part[i]='\0'; p = en_exp_number(part); if (p == 0) /* no thousands */ return en_exp_number(numstring+i); else return val_append(p,cons_val(string_val("thousand"), en_exp_number(numstring+i))); } else if (num_digits < 10) { for (i=0; i < num_digits-6; i++) part[i] = numstring[i]; part[i]='\0'; p = en_exp_number(part); if (p == 0) /* no millions */ return en_exp_number(numstring+i); else return val_append(p,cons_val(string_val("million"), en_exp_number(numstring+i))); } else if (num_digits < 13) { /* If there are pedantic brits out there, tough!, 10^9 is a billion */ for (i=0; i < num_digits-9; i++) part[i] = numstring[i]; part[i]='\0'; p = en_exp_number(part); if (p == 0) /* no billions */ return en_exp_number(numstring+i); else return val_append(p,cons_val(string_val("billion"), en_exp_number(numstring+i))); } else /* Way too many digits here, to be a number */ { return en_exp_digits(numstring); } }
cst_val *lex_lookup_return_pos(const cst_lexicon *l, const char *word, const char *pos, int *return_pos) { int index,p; char *wp, *buf; cst_val *phones = 0; int found = FALSE; wp = cst_alloc(char,strlen(word)+2); buf = cst_alloc(char,3); sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word); #ifdef CECUM_DEBUG printf ("lex_lookup_return_pos looks for %s \n", word); #endif if (l->addenda) phones = CECUM_lex_lookup_addenda (wp,l,&found); if (!found) { index = CECUM_lex_lookup_bsearch (l, l->entry_index,0,l->num_entries,wp); #ifdef CECUM_DEBUG printf ("lex_lookup_return_pos has found the word at pos %d \n", index); #endif if (index >= 0) { for (p=l->entry_index[index].phone_index; l->phones[p]; p++) phones = cons_val(string_val(l->phone_table[l->phones[p]]), phones); phones = val_reverse(phones); if (!pos) { //aggiungi in testa il pos se non lo ho! *return_pos = TRUE; sprintf(buf,"%c",l->entry_index[index].word_pos[0]); phones = cons_val(string_val(buf), phones); } //val_print(stdout,phones); //printf("\n"); } else { #ifdef CECUM_DEBUG printf ("lex_lookup_return_pos: word not found \n", index); #endif /* E' a partire da questo punto che si verifica in genere il problema su cui stavamo lavorando. Se la parola non viene trovata nel dizionario, viene restituito un indice -1. Ora, lts_rules_set (e lts_apply) non viene eseguita perchè in ifd_lex.c noi abbiamo posto ifd_lex.lts_rule_set = 0; Del resto, nella lingua italiana non sembrano essere fornite rules compatibili con lst_apply, a differenza di quello che accade con la lingua inglese Il puntatore a funzione l->lst_function viene, al contrario, eseguito. Qui la funzione richiamata è italian_lts_function() (dentro ifd_lex.c). Sarà quest'ultima a richiamare più volte una serie di routine di rewrites (è qui che si verifica l'errore su cui stiamo lavorando). 
Per esempio, è qui che verrà richiamata ifd_mid_lts_rewrites () */ if (l->lts_rule_set) phones = lts_apply(word, "", /* more features if we had them */ l->lts_rule_set); else if (l->lts_function) phones = (l->lts_function)(l,word,""); } } cst_free(wp); cst_free(buf); return phones; }
cst_val *cst_lex_make_entry(const cst_lexicon *lex, const cst_string *entry) { /* if replace then replace entry in addenda of lex with entry */ /* else append entry to addenda of lex */ cst_tokenstream *e; cst_val *phones = NULL; cst_val *ventry; const cst_string *w, *p; cst_string *word; cst_string *pos; int i; e = ts_open_string(entry, cst_ts_default_whitespacesymbols, "","",""); w = ts_get(e); if (w[0] == '"') /* it was a quoted entry */ { /* so reparse it */ ts_close(e); e = ts_open_string(entry, cst_ts_default_whitespacesymbols, "","",""); w = ts_get_quoted_token(e,'"','\\'); } word = cst_strdup(w); p = ts_get(e); if (!cst_streq(":",p)) /* there is a real pos */ { pos = cst_strdup(p); p = ts_get(e); if (!cst_streq(":",p)) /* there is a real pos */ { cst_fprintf(stdout,"add_addenda: lex %s: expected \":\" in %s\n", lex->name, word); cst_free(word); cst_free(pos); ts_close(e); return NULL; } } else pos = cst_strdup("nil"); while (!ts_eof(e)) { p = ts_get(e); /* Check its a legal phone */ for (i=0; lex->phone_table[i]; i++) { if (cst_streq(p,lex->phone_table[i])) break; } if (cst_streq("#",p)) /* comment to end of line */ break; else if (cst_streq("",p)) /* trailing ws at eoln causes this */ break; else if (lex->phone_table[i]) /* Only add it if its a valid phone */ phones = cons_val(string_val(p),phones); else { cst_fprintf(stdout,"add_addenda: lex: %s word %s phone %s not in lexicon phoneset\n", lex->name, word, p); } } ventry = cons_val(string_val(word),cons_val(string_val(pos), val_reverse(phones))); cst_free(word); cst_free(pos); ts_close(e); #if 0 printf("entry: "); val_print(stdout,ventry); printf("\n"); #endif return ventry; }