/**
 * Add a word and its pronunciation to the decoder's dictionary and,
 * if the decoder has a language model set, to that model as well.
 *
 * @param ps      Decoder to modify.
 * @param word    Word string to add.
 * @param phones  Pronunciation as a string of phone names; it is split
 *                into individual tokens with str2words().
 * @param update  If non-zero, reinitialize the current search so that
 *                it picks up the new word.
 * @return The dictionary word ID on success.  -1 if a phone is unknown,
 *         if the dictionary rejects the word, or if the language model
 *         rejects the word.  If reinitializing the search fails, the
 *         negative value returned by ps_search_reinit() is passed on.
 *
 * Note that a failure after the dictionary step does not undo the
 * dictionary / dict2pid additions already made.
 */
int
ps_add_word(ps_decoder_t *ps,
            char const *word,
            char const *phones,
            int update)
{
    int32 wid;
    ngram_model_t *lmset;
    s3cipid_t *pron;
    char **phonestr, *tmp;
    int np, i, rv;

    /* Parse phones into an array of phone IDs.  str2words() modifies
     * its input, so work on a private copy. */
    tmp = ckd_salloc(phones);
    np = str2words(tmp, NULL, 0);
    phonestr = ckd_calloc(np, sizeof(*phonestr));
    str2words(tmp, phonestr, np);
    pron = ckd_calloc(np, sizeof(*pron));
    for (i = 0; i < np; ++i) {
        pron[i] = bin_mdef_ciphone_id(ps->acmod->mdef, phonestr[i]);
        if (pron[i] == -1) {
            E_ERROR("Unknown phone %s in phone string %s\n",
                    phonestr[i], tmp);
            ckd_free(phonestr);
            ckd_free(tmp);
            ckd_free(pron);
            return -1;
        }
    }
    /* No longer needed. */
    ckd_free(phonestr);
    ckd_free(tmp);

    /* Add it to the dictionary. */
    if ((wid = dict_add_word(ps->dict, word, pron, np)) == -1) {
        ckd_free(pron);
        return -1;
    }
    /* No longer needed. */
    ckd_free(pron);

    /* Now we also have to add it to dict2pid. */
    dict2pid_add_word(ps->d2p, wid);

    if ((lmset = ps_get_lmset(ps)) != NULL) {
        /* Add it to the LM set (meaning, the current LM).  In a perfect
         * world, this would result in the same WID, but because of the
         * weird way that word IDs are handled, it doesn't.  The LM word
         * ID itself is not needed here, only whether the add worked. */
        if (ngram_model_add_word(lmset, word, 1.0) == NGRAM_INVALID_WID)
            return -1;
    }

    /* Rebuild the widmap and search tree if requested. */
    if (update) {
        /* Assign first, then compare: the comparison must not be part
         * of the assigned expression, otherwise rv would hold 0 or 1
         * and a failure would be reported as the (valid-looking)
         * positive value 1. */
        rv = ps_search_reinit(ps->search, ps->dict, ps->d2p);
        if (rv < 0)
            return rv;
    }
    return wid;
}
/**
 * Unigram-add hook for a language model set.
 *
 * Called once the word already has an ID (wid) in the base (master)
 * model.  The word is looked up in, or added to, every active submodel;
 * the per-submodel word IDs are then stored as a new row of the set's
 * word ID map.
 *
 * @param base     The model set, viewed through its base structure.
 * @param wid      Word ID of the new word in the base model.
 * @param lweight  Log-domain weight; converted with logmath_exp() before
 *                 being handed to ngram_model_add_word().
 * @return The unigram probability of the word: taken directly from the
 *         current submodel if one is selected, otherwise accumulated
 *         over all submodels with logmath_add() using the set's
 *         weights.  base->log_zero if a submodel refuses the word.
 */
static int32
ngram_model_set_add_ug(ngram_model_t * base, int32 wid, int32 lweight)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    int32 *sub_wids;
    int32 ug_prob;
    int32 m, w;

    /* One slot per submodel for the word ID the new word gets there. */
    sub_wids = ckd_calloc(set->n_models, sizeof(*sub_wids));
    ug_prob = base->log_zero;

    for (m = 0; m < set->n_models; ++m) {
        int32 sub_prob, n_hist;

        /* Submodels that are not active get no entry for this word. */
        if (set->cur != -1 && set->cur != m) {
            sub_wids[m] = NGRAM_INVALID_WID;
            continue;
        }

        /* Reuse the word if the submodel already has it, else add it. */
        sub_wids[m] = ngram_wid(set->lms[m], base->word_str[wid]);
        if (sub_wids[m] == NGRAM_INVALID_WID) {
            sub_wids[m] =
                ngram_model_add_word(set->lms[m], base->word_str[wid],
                                     (float32) logmath_exp(base->lmath,
                                                           lweight));
            if (sub_wids[m] == NGRAM_INVALID_WID) {
                ckd_free(sub_wids);
                return base->log_zero;
            }
        }

        /* Fetch the submodel's unigram probability for the word.  With
         * no single model selected (cur == -1) it is folded into the
         * running interpolated sum; otherwise this is the selected
         * model and its probability is used as-is. */
        sub_prob = ngram_ng_prob(set->lms[m], sub_wids[m], NULL, 0, &n_hist);
        if (set->cur == -1)
            ug_prob = logmath_add(base->lmath, ug_prob,
                                  set->lweights[m] + sub_prob);
        else
            ug_prob = sub_prob;
    }

    /* Grow the word ID map to base->n_words rows.  The rows live in a
     * single contiguous buffer anchored at widmap[0], so after that
     * buffer is reallocated every row pointer has to be recomputed. */
    set->widmap = ckd_realloc(set->widmap,
                              base->n_words * sizeof(*set->widmap));
    set->widmap[0] = ckd_realloc(set->widmap[0],
                                 base->n_words * set->n_models
                                 * sizeof(**set->widmap));
    for (w = 0; w < base->n_words; ++w)
        set->widmap[w] = set->widmap[0] + w * set->n_models;

    /* Store the new word's row and release the scratch array. */
    memcpy(set->widmap[wid], sub_wids, set->n_models * sizeof(*sub_wids));
    ckd_free(sub_wids);

    return ug_prob;
}
/**
 * Add a word class, with its member words and their weights, to a model.
 *
 * @param model        Model to modify.
 * @param classname    Name of the class tag; added to the model as a
 *                     word if ngram_wid() reports it as unknown.
 * @param classweight  Weight used when the class tag has to be added.
 * @param words        Array of n_words member word strings.
 * @param weights      Array of n_words weights, parallel to words.
 * @param n_words      Number of entries in words and weights.
 * @return The ID of the new class, or -1 on failure (the tag or a member
 *         word could not be added, the class limit was reached, or the
 *         class object could not be created).
 *
 * On failure, words already added to the model are not removed again.
 */
int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words, const float32 * weights, int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;

    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID) {
            /* Release the weights collected so far; without this the
             * partially built list would be leaked. */
            glist_free(classwords);
            return -1;
        }
        /* Remember the base word ID of the first member word. */
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    /* The list was built by successive adds and is reversed here so
     * that the weights end up in the same order as words[]. */
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    /* Make room for one more class pointer and register the class. */
    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes *
                                     sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
/**
 * Add a word and its pronunciation to the decoder's dictionary and to
 * the language model of every registered N-gram search.
 *
 * @param ps      Decoder to modify.
 * @param word    Word string to add.
 * @param phones  Pronunciation as a string of phone names; it is split
 *                into individual tokens with str2words().
 * @param update  If non-zero, reinitialize each registered search so
 *                that it picks up the new word.
 * @return The dictionary word ID on success.  -1 if a phone is unknown,
 *         if the dictionary rejects the word, or if a language model
 *         rejects the word.  If reinitializing a search fails, the
 *         negative value returned by ps_search_reinit() is passed on.
 *
 * Note that a failure after the dictionary step does not undo the
 * dictionary / dict2pid additions already made, nor changes applied to
 * searches visited earlier in the loop.
 */
int
ps_add_word(ps_decoder_t *ps,
            char const *word,
            char const *phones,
            int update)
{
    int32 wid;
    s3cipid_t *pron;
    hash_iter_t *search_it;
    char **phonestr, *tmp;
    int np, i, rv;

    /* Parse phones into an array of phone IDs.  str2words() modifies
     * its input, so work on a private copy. */
    tmp = ckd_salloc(phones);
    np = str2words(tmp, NULL, 0);
    phonestr = ckd_calloc(np, sizeof(*phonestr));
    str2words(tmp, phonestr, np);
    pron = ckd_calloc(np, sizeof(*pron));
    for (i = 0; i < np; ++i) {
        pron[i] = bin_mdef_ciphone_id(ps->acmod->mdef, phonestr[i]);
        if (pron[i] == -1) {
            E_ERROR("Unknown phone %s in phone string %s\n",
                    phonestr[i], tmp);
            ckd_free(phonestr);
            ckd_free(tmp);
            ckd_free(pron);
            return -1;
        }
    }
    /* No longer needed. */
    ckd_free(phonestr);
    ckd_free(tmp);

    /* Add it to the dictionary. */
    if ((wid = dict_add_word(ps->dict, word, pron, np)) == -1) {
        ckd_free(pron);
        return -1;
    }
    /* No longer needed. */
    ckd_free(pron);

    /* Now we also have to add it to dict2pid. */
    dict2pid_add_word(ps->d2p, wid);

    /* TODO: we definitely need to refactor this */
    for (search_it = hash_table_iter(ps->searches); search_it;
         search_it = hash_table_iter_next(search_it)) {
        ps_search_t *search = hash_entry_val(search_it->ent);

        /* Only searches named PS_SEARCH_NGRAM are treated as N-gram
         * searches carrying a language model set to extend. */
        if (!strcmp(PS_SEARCH_NGRAM, ps_search_name(search))) {
            ngram_model_t *lmset = ((ngram_search_t *) search)->lmset;
            if (ngram_model_add_word(lmset, word, 1.0)
                == NGRAM_INVALID_WID) {
                /* Leaving the loop early: the iterator must be freed
                 * by hand. */
                hash_table_iter_free(search_it);
                return -1;
            }
        }

        /* Rebuild the widmap and search tree if requested. */
        if (update) {
            /* Assign first, then compare: the comparison must not be
             * part of the assigned expression, otherwise rv would hold
             * 0 or 1 and a failure would be reported as the
             * (valid-looking) positive value 1. */
            rv = ps_search_reinit(search, ps->dict, ps->d2p);
            if (rv < 0) {
                hash_table_iter_free(search_it);
                return rv;
            }
        }
    }

    return wid;
}