Example #1
int32
ngram_wid(ngram_model_t * model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}
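Called on a model loaded with ngram_model_read(), ngram_wid() maps a word string to its internal word ID and falls back to the unknown-word ID for out-of-vocabulary words. A minimal usage sketch (the file name "model.arpa" and the word "hello" are placeholders, not from the original):

/* Sketch only: "model.arpa" and "hello" are placeholders. Requires
 * <sphinxbase/ngram_model.h> and <sphinxbase/logmath.h>. */
static void
wid_lookup_sketch(void)
{
    logmath_t *lmath = logmath_init(1.0001, 0, 0);
    ngram_model_t *model =
        ngram_model_read(NULL, "model.arpa", NGRAM_ARPA, lmath);

    if (model != NULL) {
        int32 wid = ngram_wid(model, "hello");
        if (wid == ngram_unknown_wid(model))
            printf("\"hello\" is out of vocabulary\n");
        ngram_model_free(model);
    }
    logmath_free(lmath);
}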
Example #2
int32
ngram_model_set_known_wid(ngram_model_t * base, int32 set_wid)
{
    ngram_model_set_t *set = (ngram_model_set_t *) base;

    if (set_wid >= base->n_words)
        return FALSE;
    else if (set->cur == -1) {
        int32 i;
        for (i = 0; i < set->n_models; ++i) {
            if (set->widmap[set_wid][i] != ngram_unknown_wid(set->lms[i]))
                return TRUE;
        }
        return FALSE;
    }
    else
        return (set->widmap[set_wid][set->cur]
                != ngram_unknown_wid(set->lms[set->cur]));
}
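ngram_model_set_known_wid() reports whether a set-level word ID maps to a real (non-unknown) word: in the currently selected sub-model if one is selected, or in any sub-model otherwise. A hedged sketch, assuming lm1 and lm2 stand for previously loaded models; the sub-model names are illustrative:

/* Sketch only: lm1/lm2 and the names are illustrative. */
static void
set_known_wid_sketch(ngram_model_t *lm1, ngram_model_t *lm2)
{
    ngram_model_t *models[2];
    const char *names[] = { "general", "domain" };
    ngram_model_t *set;
    int32 wid;

    models[0] = lm1;
    models[1] = lm2;
    set = ngram_model_set_init(NULL, models, (char **) names, NULL, 2);
    wid = ngram_wid(set, "hello");
    if (ngram_model_set_known_wid(set, wid))
        printf("\"hello\" is known to the selected sub-model\n");
    ngram_model_free(set);
}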
Example #3
int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words, const float32 * weights, int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in model.  If not, add it. */
    if ((tag_wid =
         ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID) {
            glist_free(classwords);
            return -1;
        }
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes *
                                     sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
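ngram_model_add_class() registers a word class with per-member in-class weights, up to the limit of 128 classes per model enforced above. A hedged sketch; the class name "[city]", its members, and the weights are illustrative:

/* Sketch only: class name, members, and weights are illustrative. */
static void
add_class_sketch(ngram_model_t *model)
{
    const char *words[] = { "boston", "denver" };
    const float32 weights[] = { 0.5f, 0.5f };
    int32 classid;

    classid = ngram_model_add_class(model, "[city]", 1.0f,
                                    (char **) words, weights, 2);
    if (classid < 0)
        E_ERROR("Failed to add class\n");
}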
static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
             int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
    int32 *wids;
    int32 startwid;
    int32 i, ch, nccs, noovs, unk;

    if (n == 0)
        return 0;

    unk = ngram_unknown_wid(lm);

    /* Convert the word strings to an array of word IDs, in reverse order. */
    wids = ckd_calloc(n, sizeof(*wids));
    for (i = 0; i < n; ++i)
        wids[n - i - 1] = ngram_wid(lm, words[i]);
    /* Skip <s> as it's a context cue (HACK, this should be configurable). */
    startwid = ngram_wid(lm, "<s>");

    /* Now evaluate the list of words in reverse using the
     * remainder of the array as the history. */
    ch = noovs = nccs = 0;
    for (i = 0; i < n; ++i) {
        int32 n_used;
        int32 prob;

        /* Skip <s> as it's a context cue (HACK, this should be configurable). */
        if (wids[i] == startwid) {
            ++nccs;
            continue;
        }
        /* Skip and count OOVs. */
        if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
            ++noovs;
            continue;
        }
        /* Sum up the information (negated log probability) of each N-gram. */
        prob = ngram_ng_score(lm,
                              wids[i], wids + i + 1,
                              n - i - 1, &n_used);
        if (verbose) {
            int m;
            printf("log P(%s|", ngram_word(lm, wids[i]));
            m = i + ngram_model_get_size(lm) - 1;
            if (m >= n)
                m = n - 1;
            while (m > i) {
                printf("%s ", ngram_word(lm, wids[m--]));
            }
            printf(") = %d\n", prob);
        }
        ch -= prob;
    }
    ckd_free(wids);

    if (out_n_ccs) *out_n_ccs = nccs;
    if (out_n_oovs) *out_n_oovs = noovs;

    /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
    n -= (nccs + noovs);
    if (n <= 0)
        return 0;
    if (out_lm_score)
        *out_lm_score = -ch;
    return ch / n;
}
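calc_entropy() scores a tokenized sentence against the model, counting context cues and OOVs separately, and returns the average negative log probability (the cross-entropy). Since it is declared static, a caller must live in the same file. A hedged sketch, assuming lm and the logmath_t it was loaded with (lmath) are in scope and a global verbose flag exists, as in the original:

/* Sketch only: assumes lm and its logmath_t (lmath) are in scope. */
static void
entropy_sketch(ngram_model_t *lm, logmath_t *lmath)
{
    char *words[] = { "<s>", "hello", "world", "</s>" };
    int32 nccs, noovs, lscr, ch;

    ch = calc_entropy(lm, words, 4, &nccs, &noovs, &lscr);
    printf("cross-entropy %d (perplexity %f), %d OOVs\n",
           ch, logmath_exp(lmath, ch), noovs);
}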