Example #1
0
/**
 * Find the unigram that best continues a partial grapheme match.
 *
 * Scans all unigrams in @a model; for each whose word field is a prefix
 * match (per dict_starts_with) against @a word_grapheme + @a word_offset,
 * scores it against the given history and keeps the highest-probability
 * candidate.
 *
 * @param model          N-gram model to query.
 * @param word_grapheme  Grapheme string being matched.
 * @param history_list   glist of int32 word IDs, most recent first.
 * @param total_unigrams Number of unigrams to scan in @a model.
 * @param word_offset    Offset into @a word_grapheme where matching starts.
 * @return winner_t describing the best match; all fields are zero if no
 *         unigram matched (previously this returned uninitialized memory).
 */
struct winner_t
dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, const int32 total_unigrams,
                    int word_offset)
{
    int32 current_prob = -2147483647;
    /* BUGFIX: winner was uninitialized; returning it without any match
     * was undefined behavior. Zero-init gives callers a defined result. */
    struct winner_t winner = {0};
    int32 i = 0, j = 0;
    int nused;
    int32 ngram_order = ngram_model_get_size(model);
    int32 *history = ckd_calloc((size_t)ngram_order+1, sizeof(int32));
    gnode_t *gn;
    const char *vocab;
    const char *sub;
    int32 prob;
    unigram_t unigram;

    /* Copy up to ngram_order history entries into the scratch array.
     * NOTE(review): entries are stored at indices ngram_order..1 (high to
     * low) but ngram_ng_prob below reads from index 0 — when j < ngram_order
     * the leading slots are the calloc'd zeros. Looks intentional in the
     * original; confirm against ngram_ng_prob's expected history layout. */
    for (gn = history_list; gn; gn = gnode_next(gn)) {
        history[ngram_order-j] = gnode_int32(gn);
        j++;
        if (j >= ngram_order)
            break;
    }

    for (i = 0; i < total_unigrams; i++) {
        vocab = ngram_word(model, i);
        unigram  = dict_split_unigram(vocab);
        sub = word_grapheme + word_offset;
        if (dict_starts_with(unigram.word, sub)){
            prob = ngram_ng_prob(model, i, history, j, &nused);
            if (current_prob < prob) {
                current_prob = prob;
                winner.winner_wid = i;
                winner.length_match = strlen(unigram.word);
                winner.len_phoneme = strlen(unigram.phone);
            }
        }

        /* dict_split_unigram allocates these fields; free each iteration. */
        if (unigram.word)
            ckd_free(unigram.word);
        if (unigram.phone)
            ckd_free(unigram.phone);
    }

    if (history)
        ckd_free(history);

    return winner;
}
/**
 * Compute the cross-entropy of a word sequence under an N-gram model.
 *
 * The words arrive in forward order; they are reversed into word IDs so
 * each word can be scored with the remainder of the array as its history.
 * Context cues (<s>) and OOVs are skipped and counted separately.
 *
 * @param lm           N-gram model to evaluate against.
 * @param words        Array of word strings, forward order.
 * @param n            Number of words.
 * @param out_n_ccs    If non-NULL, receives the count of context cues skipped.
 * @param out_n_oovs   If non-NULL, receives the count of OOV words skipped.
 * @param out_lm_score If non-NULL, receives the total log-probability.
 * @return Mean negative log-probability per scored word (cross-entropy),
 *         or 0 if there were no words to score.
 */
static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
	     int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
	int32 *wids;
	int32 startwid;
	int32 i, ch, nccs, noovs, unk;

	if (n == 0)
		return 0;

	unk = ngram_unknown_wid(lm);

	/* Reverse this array into an array of word IDs. */
	wids = ckd_calloc(n, sizeof(*wids));
	for (i = 0; i < n; ++i)
		wids[n-i-1] = ngram_wid(lm, words[i]);
	/* Skip <s> as it's a context cue (HACK, this should be configurable). */
	startwid = ngram_wid(lm, "<s>");

	/* Now evaluate the list of words in reverse using the
	 * remainder of the array as the history. */
	ch = noovs = nccs = 0;
	for (i = 0; i < n; ++i) {
		int32 n_used;
		int32 prob;

		/* Skip <s> as it's a context cue (HACK, this should be configurable). */
		if (wids[i] == startwid) {
			++nccs;
			continue;
		}
		/* Skip and count OOVs. */
		if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
			++noovs;
			continue;
		}
		/* Sum up information for each N-gram */
		prob = ngram_ng_score(lm,
				      wids[i], wids + i + 1,
				      n - i - 1, &n_used);
		if (verbose) {
			int m;
			printf("log P(%s|", ngram_word(lm, wids[i]));
			m = i + ngram_model_get_size(lm) - 1;
			if (m >= n)
				m = n - 1;
			while (m > i) {
				printf("%s ", ngram_word(lm, wids[m--]));
			}
			printf(") = %d\n", prob);
		}
		ch -= prob;
	}

	/* BUGFIX: wids was never freed, leaking on every call (including the
	 * early returns below). Free it as soon as scoring is complete. */
	ckd_free(wids);

	if (out_n_ccs) *out_n_ccs = nccs;
	if (out_n_oovs) *out_n_oovs = noovs;

	/* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
	n -= (nccs + noovs);
	if (n <= 0)
		return 0;
	if (out_lm_score)
		*out_lm_score = -ch;
	return ch / n;
}