/**
 * Score a word given its history against a set of language models.
 *
 * If a single model is selected (set->cur != -1), the word and history
 * IDs are remapped into that model's vocabulary and it is queried
 * directly.  Otherwise the score is a log-domain interpolation of all
 * submodels weighted by set->lweights[].
 */
static int32
ngram_model_set_score(ngram_model_t * base, int32 wid, int32 * history,
                      int32 n_hist, int32 * n_used)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    int32 score;
    int32 m;

    /* Clamp the history length to what the model order supports. */
    if (n_hist > base->n - 1)
        n_hist = base->n - 1;

    if (set->cur != -1) {
        /* One submodel selected: remap IDs and delegate to it. */
        int32 k;
        int32 w = set->widmap[wid][set->cur];
        for (k = 0; k < n_hist; ++k)
            set->maphist[k] = (history[k] == NGRAM_INVALID_WID)
                ? NGRAM_INVALID_WID
                : set->widmap[history[k]][set->cur];
        return ngram_ng_score(set->lms[set->cur], w,
                              set->maphist, n_hist, n_used);
    }

    /* No current model: interpolate over every submodel. */
    score = base->log_zero;
    for (m = 0; m < set->n_models; ++m) {
        int32 k;
        int32 w = set->widmap[wid][m];
        /* Remap the word and each history entry into submodel m. */
        for (k = 0; k < n_hist; ++k)
            set->maphist[k] = (history[k] == NGRAM_INVALID_WID)
                ? NGRAM_INVALID_WID
                : set->widmap[history[k]][m];
        score = logmath_add(base->lmath, score,
                            set->lweights[m]
                            + ngram_ng_score(set->lms[m], w,
                                             set->maphist, n_hist, n_used));
    }
    return score;
}
/**
 * Score a word given a NULL-terminated varargs list of history words.
 *
 * The varargs are walked twice: once to count the history words, once
 * to convert each to a word ID.  The ID array is freed before return.
 */
int32
ngram_score(ngram_model_t * model, const char *word, ...)
{
    va_list ap;
    const char *hword;
    int32 *hist_wids;
    int32 count;
    int32 used;
    int32 result;

    /* Pass 1: count the history words. */
    va_start(ap, word);
    for (count = 0; va_arg(ap, const char *) != NULL; ++count)
        ;
    va_end(ap);

    hist_wids = ckd_calloc(count, sizeof(*hist_wids));

    /* Pass 2: map each history word to its ID. */
    va_start(ap, word);
    for (count = 0; (hword = va_arg(ap, const char *)) != NULL; ++count)
        hist_wids[count] = ngram_wid(model, hword);
    va_end(ap);

    result = ngram_ng_score(model, ngram_wid(model, word),
                            hist_wids, count, &used);
    ckd_free(hist_wids);
    return result;
}
/**
 * Trigram convenience wrapper: score w3 given the two-word history
 * (w2 most recent, then w1), delegating to the generic N-gram scorer.
 */
int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1,
               int32 *n_used)
{
    int32 context[2] = { w2, w1 };
    return ngram_ng_score(model, w3, context, 2, n_used);
}
/**
 * Compute the (integer log-domain) cross-entropy of a word sequence
 * under language model lm: CH = -1/N * sum log P(w|h), where N excludes
 * context cues (<s>) and OOV words.
 *
 * @param lm            Language model to evaluate against.
 * @param words         Array of n word strings, in sentence order.
 * @param n             Number of words in @a words.
 * @param out_n_ccs     Output: number of context cues skipped (may be NULL).
 * @param out_n_oovs    Output: number of OOV words skipped (may be NULL).
 * @param out_lm_score  Output: total log probability, i.e. -sum of per-word
 *                      costs (may be NULL).  Always set on return.
 * @return Cross-entropy (integer division), or 0 if no words were scored.
 */
static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
             int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
    int32 *wids;
    int32 startwid;
    int32 i, ch, nccs, noovs, unk;

    if (n == 0)
        return 0;

    unk = ngram_unknown_wid(lm);

    /* Reverse the words into an array of IDs so that wids + i + 1 is
     * the history (most recent first) for wids[i]. */
    wids = ckd_calloc(n, sizeof(*wids));
    for (i = 0; i < n; ++i)
        wids[n - i - 1] = ngram_wid(lm, words[i]);

    /* Skip <s> as it's a context cue (HACK, this should be configurable). */
    startwid = ngram_wid(lm, "<s>");

    /* Evaluate each word in reverse, using the remainder of the array
     * as its history. */
    ch = noovs = nccs = 0;
    for (i = 0; i < n; ++i) {
        int32 n_used;
        int32 prob;

        /* Skip <s> as it's a context cue (HACK, this should be configurable). */
        if (wids[i] == startwid) {
            ++nccs;
            continue;
        }
        /* Skip and count OOVs. */
        if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
            ++noovs;
            continue;
        }
        /* Sum up information for each N-gram. */
        prob = ngram_ng_score(lm, wids[i], wids + i + 1, n - i - 1, &n_used);
        if (verbose) {
            int m;
            printf("log P(%s|", ngram_word(lm, wids[i]));
            m = i + ngram_model_get_size(lm) - 1;
            if (m >= n)
                m = n - 1;
            while (m > i) {
                printf("%s ", ngram_word(lm, wids[m--]));
            }
            printf(") = %d\n", prob);
        }
        ch -= prob;
    }
    /* BUG FIX: wids was previously leaked (ckd_calloc with no free). */
    ckd_free(wids);

    if (out_n_ccs)
        *out_n_ccs = nccs;
    if (out_n_oovs)
        *out_n_oovs = noovs;
    /* BUG FIX: set out_lm_score before the early return below, so it is
     * defined even when every word was a context cue or an OOV (ch is 0
     * in that case, giving a total score of 0). */
    if (out_lm_score)
        *out_lm_score = -ch;

    /* Calculate cross-entropy CH = -1/N * sum log P(W|H) over the words
     * that were actually scored. */
    n -= (nccs + noovs);
    if (n <= 0)
        return 0;
    return ch / n;
}
/**
 * Bigram convenience wrapper: score w2 given the single-word history
 * w1 by delegating to the generic N-gram scorer.
 */
int32
ngram_bg_score(ngram_model_t * model, int32 w2, int32 w1, int32 * n_used)
{
    int32 context = w1;
    return ngram_ng_score(model, w2, &context, 1, n_used);
}