static int32 ngram_model_set_raw_score(ngram_model_t * base, int32 wid, int32 * history, int32 n_hist, int32 * n_used) { ngram_model_set_t *set = (ngram_model_set_t *) base; int32 mapwid; int32 score; int32 i; /* Truncate the history. */ if (n_hist > base->n - 1) n_hist = base->n - 1; /* Interpolate if there is no current. */ if (set->cur == -1) { score = base->log_zero; for (i = 0; i < set->n_models; ++i) { int32 j; /* Map word and history IDs for each model. */ mapwid = set->widmap[wid][i]; for (j = 0; j < n_hist; ++j) { if (history[j] == NGRAM_INVALID_WID) set->maphist[j] = NGRAM_INVALID_WID; else set->maphist[j] = set->widmap[history[j]][i]; } score = logmath_add(base->lmath, score, set->lweights[i] + ngram_ng_prob(set->lms[i], mapwid, set->maphist, n_hist, n_used)); } } else { int32 j; /* Map word and history IDs (FIXME: do this in a function?) */ mapwid = set->widmap[wid][set->cur]; for (j = 0; j < n_hist; ++j) { if (history[j] == NGRAM_INVALID_WID) set->maphist[j] = NGRAM_INVALID_WID; else set->maphist[j] = set->widmap[history[j]][set->cur]; } score = ngram_ng_prob(set->lms[set->cur], mapwid, set->maphist, n_hist, n_used); } return score; }
/**
 * Variadic probability lookup: the NULL-terminated argument list after
 * `word` is the history (most recent first), given as strings.
 */
int32
ngram_probv(ngram_model_t * model, const char *word, ...)
{
    va_list args;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    /* First pass over the varargs: count the history words. */
    n_hist = 0;
    va_start(args, word);
    while ((hword = va_arg(args, const char *)) != NULL)
        ++n_hist;
    va_end(args);

    histid = ckd_calloc(n_hist, sizeof(*histid));

    /* Second pass: map each history word to its ID. */
    n_hist = 0;
    va_start(args, word);
    while ((hword = va_arg(args, const char *)) != NULL)
        histid[n_hist++] = ngram_wid(model, hword);
    va_end(args);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}
/* Propagate a unigram that was just added to the master model into the
 * active submodels, record the per-submodel word IDs in the widmap, and
 * return the (possibly interpolated) unigram log-probability.
 * Returns base->log_zero if adding the word to a submodel fails. */
static int32
ngram_model_set_add_ug(ngram_model_t * base, int32 wid, int32 lweight)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    int32 *newwid;
    int32 i, prob;

    /* At this point the word has already been added to the master model
       and we have a new word ID for it.  Add it to active submodels and
       track the word IDs. */
    newwid = ckd_calloc(set->n_models, sizeof(*newwid));
    prob = base->log_zero;
    for (i = 0; i < set->n_models; ++i) {
        int32 wprob, n_hist;

        /* Only add to active models (all models when set->cur == -1,
         * otherwise just the currently selected one). */
        if (set->cur == -1 || set->cur == i) {
            /* Did this word already exist? */
            newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]);
            if (newwid[i] == NGRAM_INVALID_WID) {
                /* Add it to the submodel.  lweight is converted from the
                 * log domain to linear for ngram_model_add_word. */
                newwid[i] =
                    ngram_model_add_word(set->lms[i], base->word_str[wid],
                                         (float32) logmath_exp(base->lmath,
                                                               lweight));
                if (newwid[i] == NGRAM_INVALID_WID) {
                    /* Failed: release the scratch array and report. */
                    ckd_free(newwid);
                    return base->log_zero;
                }
            }
            /* Now get the unigram probability for the new word and either
             * interpolate it or use it (if this is the current model). */
            wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist);
            if (set->cur == i)
                prob = wprob;
            else if (set->cur == -1)
                prob = logmath_add(base->lmath, prob,
                                   set->lweights[i] + wprob);
        }
        else {
            /* Inactive submodel: the word has no ID there. */
            newwid[i] = NGRAM_INVALID_WID;
        }
    }
    /* Okay we have the word IDs for this in all the submodels.  Now do
       some complicated memory mangling to add this to the widmap:
       widmap is a 2-D array backed by one contiguous buffer rooted at
       widmap[0], so after growing both the row-pointer array and the
       backing buffer, every row pointer must be recomputed (realloc may
       have moved the buffer). */
    set->widmap = ckd_realloc(set->widmap,
                              base->n_words * sizeof(*set->widmap));
    set->widmap[0] = ckd_realloc(set->widmap[0],
                                 base->n_words * set->n_models
                                 * sizeof(**set->widmap));
    for (i = 0; i < base->n_words; ++i)
        set->widmap[i] = set->widmap[0] + i * set->n_models;
    /* Copy the new word's per-submodel IDs into its row. */
    memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid));
    ckd_free(newwid);
    return prob;
}
/* Score the path that extends tree_element_from with wid and, if it beats
 * the worst entry in the bounded heap (or the heap has room), insert a new
 * tree element for it. */
void
try_add_tree_element(ngram_model_t *model, int32 wid, int32 *history,
                     int32 history_size, tree_element_t *tree_element_from,
                     array_heap_t *heap)
{
    int32 n_used;
    int32 path_prob;

    /* Path score: ngram log-probability of wid given the history, plus
     * the accumulated score of the prefix path (when there is one). */
    path_prob = ngram_ng_prob(model, wid, history, history_size, &n_used);
    if (tree_element_from != NULL)
        path_prob += tree_element_from->probability;

    if (array_heap_full(heap)) {
        /* Heap at capacity: only displace the current minimum if the new
         * path strictly beats it. */
        if (array_heap_min_key(heap) >= path_prob)
            return;
        ckd_free(array_heap_pop(heap));
    }
    array_heap_add(heap, path_prob,
                   tree_element_new(wid, path_prob, tree_element_from));
}
/* Find the unigram whose word part best matches the grapheme suffix
 * word_grapheme + word_offset, scoring candidates by their ngram
 * probability given history_list.  Returns the winning word ID along
 * with its matched-grapheme and phone-string lengths.
 *
 * If no unigram matches at all, a zeroed winner_t is returned (the
 * original left it uninitialized, which was undefined behavior for the
 * caller to read). */
struct winner_t
dict_get_winner_wid(ngram_model_t *model, const char *word_grapheme,
                    glist_t history_list, const int32 total_unigrams,
                    int word_offset)
{
    int32 current_prob = -2147483647;
    /* Zero-initialize: previously returned uninitialized when no
     * candidate matched. */
    struct winner_t winner = { 0 };
    int32 i = 0, j = 0;
    int nused;
    int32 ngram_order = ngram_model_get_size(model);
    int32 *history = ckd_calloc((size_t) ngram_order + 1, sizeof(int32));
    gnode_t *gn;
    const char *vocab;
    const char *sub;
    int32 prob;
    unigram_t unigram;

    /* Collect up to ngram_order history entries, filling from the back
     * of the array.  NOTE(review): entries land at indices
     * [ngram_order - j + 1 .. ngram_order] but the scoring call below
     * passes `history` (the front) with length j, so when
     * j < ngram_order the scored slots are the calloc'd zeros — confirm
     * this is the intended alignment. */
    for (gn = history_list; gn; gn = gnode_next(gn)) {
        history[ngram_order - j] = gnode_int32(gn);
        j++;
        if (j >= ngram_order)
            break;
    }

    for (i = 0; i < total_unigrams; i++) {
        vocab = ngram_word(model, i);
        unigram = dict_split_unigram(vocab);
        sub = word_grapheme + word_offset;
        if (dict_starts_with(unigram.word, sub)) {
            prob = ngram_ng_prob(model, i, history, j, &nused);
            if (current_prob < prob) {
                current_prob = prob;
                winner.winner_wid = i;
                winner.length_match = strlen(unigram.word);
                winner.len_phoneme = strlen(unigram.phone);
            }
        }
        /* ckd_free/free accept NULL, so no guards are needed. */
        ckd_free(unigram.word);
        ckd_free(unigram.phone);
    }
    ckd_free(history);
    return winner;
}
/**
 * Probability of words[0] given the history words[1..n-1].
 * The caller supplies at least one word (n >= 1).
 */
int32
ngram_prob(ngram_model_t * model, const char *const *words, int32 n)
{
    int32 *hist_ids;
    int32 n_used;
    int32 head_wid;
    int32 score;
    uint32 k;

    /* Map the history words (everything after the first) to IDs. */
    hist_ids = (int32 *) ckd_calloc(n - 1, sizeof(*hist_ids));
    for (k = 1; k < (uint32) n; ++k)
        hist_ids[k - 1] = ngram_wid(model, words[k]);

    /* Score the head word against that context. */
    head_wid = ngram_wid(model, *words);
    score = ngram_ng_prob(model, head_wid, hist_ids, n - 1, &n_used);

    ckd_free(hist_ids);
    return score;
}