/**
 * Look up the numeric word ID for a word string.
 *
 * @return the word's ID from the model's vocabulary hash table, or the
 *         model's unknown-word ID when the string is not in the vocabulary.
 */
int32
ngram_wid(ngram_model_t * model, const char *word)
{
    int32 wid;

    /* hash_table_lookup_int32() returns -1 when the key is absent. */
    if (hash_table_lookup_int32(model->wid, word, &wid) == -1)
        return ngram_unknown_wid(model);
    return wid;
}
/**
 * Test whether a set-level word ID is known to the model set.
 *
 * When a single submodel is selected (set->cur != -1), the word is known
 * iff that submodel maps it to something other than its unknown-word ID.
 * When no submodel is selected (set->cur == -1), the word is known iff
 * ANY submodel knows it.
 *
 * @return TRUE if the word is known, FALSE otherwise (including when
 *         set_wid is out of range for the set's vocabulary).
 */
int32
ngram_model_set_known_wid(ngram_model_t * base, int32 set_wid)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;
    int32 m;

    /* Out-of-range IDs are never known. */
    if (set_wid >= base->n_words)
        return FALSE;

    /* A specific submodel is active: consult it alone. */
    if (set->cur != -1)
        return (set->widmap[set_wid][set->cur]
                != ngram_unknown_wid(set->lms[set->cur]));

    /* No active submodel: known if any submodel recognizes the word. */
    for (m = 0; m < set->n_models; ++m) {
        if (set->widmap[set_wid][m] != ngram_unknown_wid(set->lms[m]))
            return TRUE;
    }
    return FALSE;
}
/**
 * Add a new word class to a language model.
 *
 * The class tag @a classname is added to the vocabulary if not already
 * present (with unigram weight @a classweight), then each of the
 * @a n_words member words is added tagged with the new class ID, and a
 * ngram_class_t is built from the member weights.
 *
 * @param model       Model to add the class to.
 * @param classname   Tag word naming the class (e.g. "[CITY]").
 * @param classweight Unigram weight used if the tag word must be added.
 * @param words       Array of member word strings.
 * @param weights     Per-member in-class weights, parallel to @a words.
 * @param n_words     Number of member words.
 * @return the new class ID (>= 0), or -1 on failure (invalid word,
 *         class limit reached, or class construction failure).
 */
int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 * weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    /* Class IDs are packed into 7 bits of the word ID, hence the cap. */
    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID) {
            /* BUGFIX: don't leak the weight list accumulated so far. */
            glist_free(classwords);
            return -1;
        }
        /* Members are assigned consecutive base IDs; remember the first. */
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    /* glist_add_* prepends, so reverse to restore caller order. */
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    /* Grow the class table and register the new class. */
    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes
                                     * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
/**
 * Compute the (negated, log-domain) cross-entropy of a word sequence
 * under a language model: CH = -1/N * sum log P(w|h).
 *
 * Context cues (<s>) and OOV words are skipped and counted separately.
 *
 * @param lm           Language model to evaluate against.
 * @param words        Array of word strings (in sentence order).
 * @param n            Number of words.
 * @param out_n_ccs    Output: number of context cues skipped (may be NULL).
 * @param out_n_oovs   Output: number of OOVs skipped (may be NULL).
 * @param out_lm_score Output: total LM log-score (may be NULL).
 * @return cross-entropy in the model's integer log domain, or 0 when no
 *         scoreable words remain.
 */
static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
             int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
    int32 *wids;
    int32 startwid;
    int32 i, ch, nccs, noovs, unk;

    if (n == 0)
        return 0;

    unk = ngram_unknown_wid(lm);

    /* Reverse this array into an array of word IDs, since
     * ngram_ng_score() takes the history most-recent-first. */
    wids = ckd_calloc(n, sizeof(*wids));
    for (i = 0; i < n; ++i)
        wids[n - i - 1] = ngram_wid(lm, words[i]);

    /* Skip <s> as it's a context cue (HACK, this should be configurable). */
    startwid = ngram_wid(lm, "<s>");

    /* Now evaluate the list of words in reverse using the
     * remainder of the array as the history. */
    ch = noovs = nccs = 0;
    for (i = 0; i < n; ++i) {
        int32 n_used;
        int32 prob;

        /* Skip <s> as it's a context cue (HACK, this should be configurable). */
        if (wids[i] == startwid) {
            ++nccs;
            continue;
        }
        /* Skip and count OOVs. */
        if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
            ++noovs;
            continue;
        }
        /* Sum up information for each N-gram */
        prob = ngram_ng_score(lm, wids[i], wids + i + 1, n - i - 1, &n_used);
        if (verbose) {
            int m;
            printf("log P(%s|", ngram_word(lm, wids[i]));
            m = i + ngram_model_get_size(lm) - 1;
            if (m >= n)
                m = n - 1;
            while (m > i) {
                printf("%s ", ngram_word(lm, wids[m--]));
            }
            printf(") = %d\n", prob);
        }
        ch -= prob;
    }

    /* BUGFIX: wids was never freed; release it now that scoring is done
     * (before any return path below). */
    ckd_free(wids);

    if (out_n_ccs)
        *out_n_ccs = nccs;
    if (out_n_oovs)
        *out_n_oovs = noovs;

    /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
    n -= (nccs + noovs);
    if (n <= 0)
        return 0;
    if (out_lm_score)
        *out_lm_score = -ch;
    return ch / n;
}