/*
 * Test driver for ngram_model_casefold().
 *
 * Reads the binary language model LMDIR "/100.arpa.DMP", folds its
 * vocabulary to upper case and then to lower case, and after each fold
 * checks the strings returned by ngram_word() for a fixed set of word IDs.
 * argc/argv are not used.  Returns 0 after the last check; what happens on
 * a failed check is determined by the TEST_* macros (not shown here).
 */
int
main(int argc, char *argv[])
{
	logmath_t *lmath;
	ngram_model_t *model;

	/* Initialize a logmath object to pass to ngram_read */
	lmath = logmath_init(1.0001, 0, 0);

	/* Read a language model */
	model = ngram_model_read(NULL, LMDIR "/100.arpa.DMP", NGRAM_BIN, lmath);
	TEST_ASSERT(model);

	/* Fold to upper case: alphabetic words change, "</s>" and "~" are
	 * expected to come back unchanged. */
	ngram_model_casefold(model, NGRAM_UPPER);

	TEST_EQUAL(0, strcmp("</s>", ngram_word(model, 5)));
	TEST_EQUAL(0, strcmp("BE", ngram_word(model, 42)));
	TEST_EQUAL(0, strcmp("FLOORED", ngram_word(model, 130)));
	TEST_EQUAL(0, strcmp("ZERO", ngram_word(model, 398)));
	TEST_EQUAL(0, strcmp("~", ngram_word(model, 399)));

	/* Fold back to lower case; the same word IDs must still map to the
	 * same words, now in lower case. */
	ngram_model_casefold(model, NGRAM_LOWER);

	TEST_EQUAL(0, strcmp("</s>", ngram_word(model, 5)));
	TEST_EQUAL(0, strcmp("be", ngram_word(model, 42)));
	TEST_EQUAL(0, strcmp("floored", ngram_word(model, 130)));
	TEST_EQUAL(0, strcmp("zero", ngram_word(model, 398)));
	TEST_EQUAL(0, strcmp("~", ngram_word(model, 399)));

	/* Release the model and the logmath object it was read with. */
	ngram_model_free(model);
	logmath_free(lmath);

	return 0;
}
Example #2
0
/*
 * Write a human-readable listing of Viterbi history entries to fp.
 *
 * vh    history to dump.
 * frm   frame to dump; a negative value dumps every frame 0 .. n_frm-1.
 * lm    language model used to turn the two LM-state word IDs of each
 *       entry into strings.
 * dict  dictionary used to turn each entry's word ID into a string.
 * fp    destination stream; it is flushed before returning.
 *
 * Entries flagged as not valid are marked with a leading '*'.
 */
void
vithist_dump(vithist_t * vh, int32 frm, ngram_model_t *lm, s3dict_t *dict, FILE * fp)
{
    int32 first_frm, last_frm, cur_frm;

    /* Banner line and frame range: a single frame, or the whole history. */
    if (frm < 0) {
        first_frm = 0;
        last_frm = vh->n_frm - 1;

        fprintf(fp, "VITHIST  #frames %d  #entries %d\n", vh->n_frm,
                vh->n_entry);
    }
    else {
        first_frm = last_frm = frm;

        fprintf(fp, "VITHIST  frame %d  #entries %d\n",
                frm, vh->frame_start[frm + 1] - vh->frame_start[frm]);
    }

    /* Column headings. */
    fprintf(fp, "\t%7s %5s %5s %11s %9s %8s %7s %4s Word (LM-state)\n",
            "Seq/Val", "SFrm", "EFrm", "PathScr", "SegAScr", "SegLScr",
            "Pred", "Type");

    for (cur_frm = first_frm; cur_frm <= last_frm; ++cur_frm) {
        int32 id;

        /* Per-frame summary: best score and best entry of this frame. */
        fprintf(fp, "%5d BS: %11d BV: %8d\n", cur_frm,
                vh->bestscore[cur_frm], vh->bestvh[cur_frm]);

        /* Entries of a frame are the IDs in
         * [frame_start[frame], frame_start[frame + 1]). */
        id = vh->frame_start[cur_frm];
        while (id < vh->frame_start[cur_frm + 1]) {
            vithist_entry_t *entry = vithist_id2entry(vh, id);
            int32 second_lwid;

            fprintf(fp, "\t%c%6d %5d %5d %11d %9d %8d %7d %4d %s",
                    (entry->valid ? ' ' : '*'), id,
                    entry->sf, entry->ef, entry->path.score,
                    entry->ascr, entry->lscr,
                    entry->path.pred, entry->type,
                    s3dict_wordstr(dict, entry->wid));

            /* LM state: the two language-model word IDs, as strings. */
            fprintf(fp, " (%s", ngram_word(lm, entry->lmstate.lm3g.lwid[0]));
            second_lwid = entry->lmstate.lm3g.lwid[1];
            fprintf(fp, ", %s)\n", ngram_word(lm, second_lwid));

            ++id;
        }

        /* A frame that produced no entry lines gets a blank line instead. */
        if (id == vh->frame_start[cur_frm])
            fprintf(fp, "\n");
    }

    fprintf(fp, "END_VITHIST\n");
    fflush(fp);
}
Example #3
0
/*
 * Return the phoneme text carried by one tree element, or NULL when the
 * element contributes nothing to the output.
 *
 * The phoneme text is whatever follows the first '}' in the element's
 * unigram string.  NULL is returned when the model has no string for the
 * word ID, when the string contains no '}', or when the phoneme text is
 * exactly "_".
 */
static const char *
unwind_phoneme_part(ngram_model_t *model, tree_element_t *element)
{
    const char *text = ngram_word(model, element->wid);
    const char *brace;

    if (text == NULL)
        return NULL;

    brace = strstr(text, "}");
    if (brace == NULL)
        return NULL;

    ++brace;
    return (strcmp(brace, "_") == 0) ? NULL : brace;
}

/*
 * Build the phoneme string for a path through the tree.
 *
 * Walks from tree_element up through ->parent to the root and joins the
 * phoneme part of every unigram on the way, separated by single spaces.
 * Because the walk goes leaf-to-root, the buffer is filled from the back
 * so the result reads root-to-leaf.  Every '|' inside a phoneme part is
 * replaced by a space; parts equal to "_" are skipped.
 *
 * Returns a NUL-terminated string allocated with ckd_malloc(); the caller
 * owns it.  When no element contributes a phoneme (including a NULL
 * tree_element) an empty string is returned; the previous code wrote to
 * phoneme[-1] in that case.
 */
char *unwind_phoneme(ngram_model_t *model, tree_element_t *tree_element) {
    size_t size = 0;
    size_t pos;
    char *phoneme;
    const char *part;
    tree_element_t *element;

    /* Pass 1: each contributing part needs its characters plus one byte,
     * which ends up as either a separating space or the final NUL. */
    for (element = tree_element; element; element = element->parent) {
        part = unwind_phoneme_part(model, element);
        if (part != NULL)
            size += strlen(part) + 1;
    }

    if (size == 0) {
        phoneme = ckd_malloc(1);
        phoneme[0] = '\0';
        return phoneme;
    }

    phoneme = ckd_malloc(size);
    phoneme[size - 1] = '\0';

    /* Pass 2: pos is the index of the first byte already written; each
     * part is placed immediately in front of it. */
    pos = size - 1;
    for (element = tree_element; element; element = element->parent) {
        size_t len, k;

        part = unwind_phoneme_part(model, element);
        if (part == NULL)
            continue;

        len = strlen(part);
        pos -= len;
        for (k = 0; k < len; k++)
            phoneme[pos + k] = (part[k] == '|') ? ' ' : part[k];

        /* Separator in front of this part, unless it starts the buffer. */
        if (pos > 0) {
            --pos;
            phoneme[pos] = ' ';
        }
    }
    return phoneme;
}
Example #4
0
/*
 * Check a loaded language model against a fixed set of expected values:
 * word <-> ID mappings for "<UNK>" and "absolute", and integer scores for
 * selected unigrams, one bigram and one trigram.
 *
 * Returns 0 after the last check; what happens on a failed check is
 * determined by the TEST_* macros (not shown here).
 */
static int
test_lm_vals(ngram_model_t *model)
{
	int32 n_used;
	TEST_ASSERT(model);
	/* Word string <-> word ID mapping must round-trip. */
	TEST_EQUAL(ngram_wid(model, "<UNK>"), 0);
	TEST_EQUAL(strcmp(ngram_word(model, 0), "<UNK>"), 0);
	TEST_EQUAL(ngram_wid(model, "absolute"), 13);
	TEST_EQUAL(strcmp(ngram_word(model, 13), "absolute"), 0);
	/* Test unigrams. */
	/* Each unigram is scored twice, by string and by word ID; both calls
	 * must agree, and the ID-based call must report n_used == 1. */
	TEST_EQUAL(ngram_score(model, "<UNK>", NULL), -75346);
	TEST_EQUAL(ngram_bg_score(model, ngram_wid(model, "<UNK>"),
				  NGRAM_INVALID_WID, &n_used), -75346);
	TEST_EQUAL(n_used, 1);
	TEST_EQUAL(ngram_score(model, "sphinxtrain", NULL), -64208);
	TEST_EQUAL(ngram_bg_score(model, ngram_wid(model, "sphinxtrain"),
				  NGRAM_INVALID_WID, &n_used), -64208);
	TEST_EQUAL(n_used, 1);
	/* Test bigrams. */
	TEST_EQUAL(ngram_score(model, "huggins", "david", NULL), -831);
	/* Test trigrams. */
	/* NOTE(review): this check uses TEST_EQUAL_LOG rather than TEST_EQUAL,
	 * presumably a tolerance-based comparison - confirm against the macro. */
	TEST_EQUAL_LOG(ngram_score(model, "daines", "huggins", "david", NULL), -9450);
	return 0;
}
Example #5
0
/*
 * Find the best-scoring unigram whose grapheme part matches the input
 * word at a given offset.
 *
 * model           G2P n-gram model whose vocabulary is scanned.
 * word_grapheme   the word being converted.
 * history_list    previously chosen word IDs; at most ngram_order of them
 *                 are copied into the history passed to ngram_ng_prob().
 * total_unigrams  number of word IDs (0 .. total_unigrams-1) to scan.
 * word_offset     offset into word_grapheme where matching starts.
 *
 * Every vocabulary entry is split with dict_split_unigram(); entries
 * accepted by dict_starts_with() are scored with ngram_ng_prob() and the
 * highest-scoring one wins (the first one on ties).
 *
 * Returns the winner's word ID, the length of its grapheme part and the
 * length of its phoneme part.  When nothing matches, all three fields are
 * 0, so callers can test length_match == 0; previously the struct was
 * returned uninitialized in that case.
 */
struct winner_t
dict_get_winner_wid(ngram_model_t *model, const char * word_grapheme, glist_t history_list, const int32 total_unigrams,
                    int word_offset)
{
    int32 current_prob = -2147483647;
    struct winner_t winner;
    int32 i = 0, j = 0;
    int nused;
    int32 ngram_order = ngram_model_get_size(model);
    int32 *history = ckd_calloc((size_t)ngram_order+1, sizeof(int32));
    gnode_t *gn;
    const char *vocab;
    /* The part of the word still to be matched; constant over the scan. */
    const char *sub = word_grapheme + word_offset;
    int32 prob;
    unigram_t unigram;

    /* "No match" result. */
    winner.winner_wid = 0;
    winner.length_match = 0;
    winner.len_phoneme = 0;

    /* NOTE(review): entries are stored from history[ngram_order] downwards,
     * while ngram_ng_prob() below is handed (history, j), i.e. the first j
     * slots.  Kept exactly as found - confirm that this is the intended
     * layout. */
    for (gn = history_list; gn; gn = gnode_next(gn)) {
        history[ngram_order-j] = gnode_int32(gn);
        j++;
        if (j >= ngram_order)
            break;
    }

    for (i = 0; i < total_unigrams; i++) {
        vocab = ngram_word(model, i);
        /* Skip IDs without a string, as dict_g2p() does for its lookups. */
        if (vocab == NULL)
            continue;

        unigram  = dict_split_unigram(vocab);
        if (dict_starts_with(unigram.word, sub)){
            prob = ngram_ng_prob(model, i, history, j, &nused);
            if (current_prob < prob) {
                current_prob = prob;
                winner.winner_wid = i;
                winner.length_match = strlen(unigram.word);
                winner.len_phoneme = strlen(unigram.phone);
            }
        }

        /* The two halves of the split unigram are released here once
         * they have been examined. */
        if (unigram.word)
            ckd_free(unigram.word);
        if (unigram.phone)
            ckd_free(unigram.phone);
    }

    if (history)
        ckd_free(history);

    return winner;
}
Example #6
0
/*
 * Convert a grapheme string to a phoneme string using an n-gram model.
 *
 * model              G2P n-gram model.
 * grapheme           NUL-terminated input word.
 * level_count_limit  capacity passed to array_heap_new() for each of the
 *                    per-position heaps.
 *
 * One heap is kept per grapheme position, plus a final one-slot heap for
 * the end-of-sentence step.  For every position, every unigram that
 * graphemes_fit_count() accepts is offered to try_add_tree_elements(),
 * with the heap of the preceding position as source and the heap at the
 * position where the unigram ends as destination.  Finally "</s>" is
 * offered to the last heap and the winning path is unwound.
 *
 * Returns the string produced by unwind_phoneme(), or NULL when no
 * complete path reached the final heap.  A NULL or empty grapheme also
 * yields NULL; the previous code indexed tree_table[-1] for an empty
 * input.
 */
char *g2p(ngram_model_t *model, char *grapheme, uint32 level_count_limit) {
    int32 i, j, n, wid, fit_count;
    array_heap_t **tree_table;
    const char* unigram_text;
    char* phoneme;
    int32 *history_buffer;
    int32 start_wid, end_wid;
    const uint32 total_unigrams = *ngram_model_get_counts(model);

    /* Nothing to convert, and tree_table[n - 1] below needs n >= 1. */
    if (grapheme == NULL || grapheme[0] == '\0') {
        return NULL;
    }

    n = strlen(grapheme);
    tree_table = ckd_calloc(n + 1, sizeof(array_heap_t *));
    for (i = 0; i < n; i++) {
        tree_table[i] = array_heap_new(level_count_limit);
    }
    /* The final heap only ever needs the single best complete path. */
    tree_table[n] = array_heap_new(1);
    history_buffer = ckd_calloc(n + 1, sizeof(int32));
    start_wid = ngram_wid(model, "<s>");
    end_wid = ngram_wid(model, "</s>");

    for (i = 0; i < n; i++) {
        for (wid = 0; wid < total_unigrams; wid++) {
            unigram_text = ngram_word(model, wid);
            fit_count = graphemes_fit_count(grapheme, i, unigram_text);
            if (fit_count != 0) {
                /* A unigram starting at i and covering fit_count graphemes
                 * ends at i + fit_count - 1; position 0 has no predecessor
                 * heap. */
                try_add_tree_elements(model, wid, i == 0 ? NULL : tree_table[i - 1], tree_table[i + fit_count - 1],
                        history_buffer, start_wid);
            }
        }

    }

    /* Close the sentence from the heap of the last grapheme. */
    try_add_tree_elements(model, end_wid, tree_table[n - 1], tree_table[n], history_buffer, start_wid);

    /* The element in the final heap is the "</s>" step itself, so the
     * unwinding starts from its parent. */
    phoneme = (tree_table[n]->size == 0) ? NULL : unwind_phoneme(model,
            ((tree_element_t*) array_heap_element(tree_table[n], 0))->parent);

    /* Free every tree element, then the heaps that held them. */
    for (i = 0; i <= n; i++) {
        for (j = 0; j < tree_table[i]->size; j++) {
            ckd_free(array_heap_element(tree_table[i], j));
        }
        array_heap_free(tree_table[i]);
    }

    ckd_free(tree_table);
    ckd_free(history_buffer);
    return phoneme;
}
/*
 * Score a sequence of words with a language model.
 *
 * lm            language model.
 * words         the n words, in their natural order.
 * n             number of entries in words.
 * out_n_ccs     if non-NULL, receives the number of "<s>" tokens skipped.
 * out_n_oovs    if non-NULL, receives the number of words skipped because
 *               their ID was NGRAM_INVALID_WID or the model's unknown-word
 *               ID.
 * out_lm_score  if non-NULL and at least one word was scored, receives the
 *               sum of the ngram_ng_score() values.
 *
 * Returns the negated score sum divided (integer division) by the number
 * of scored words, or 0 when n <= 0 or no word was scored.  For n <= 0
 * none of the output parameters are written.
 *
 * The scratch array of word IDs is now released before returning; it used
 * to be leaked on every call.
 */
static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
	     int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
	int32 *wids;
	int32 startwid;
	int32 i, ch, nccs, noovs, unk;

	/* Nothing to score; also keeps a negative count away from the
	 * allocator. */
	if (n <= 0)
		return 0;

	unk = ngram_unknown_wid(lm);

	/* Reverse this array into an array of word IDs. */
	wids = ckd_calloc(n, sizeof(*wids));
	for (i = 0; i < n; ++i)
		wids[n-i-1] = ngram_wid(lm, words[i]);
	startwid = ngram_wid(lm, "<s>");

	/* Now evaluate the list of words in reverse using the
	 * remainder of the array as the history. */
	ch = noovs = nccs = 0;
	for (i = 0; i < n; ++i) {
		int32 n_used;
		int32 prob;

		/* Skip <s> as it's a context cue (HACK, this should be configurable). */
		if (wids[i] == startwid) {
			++nccs;
			continue;
		}
		/* Skip and count OOVs. */
		if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
			++noovs;
			continue;
		}
		/* Sum up information for each N-gram */
		prob = ngram_ng_score(lm,
				      wids[i], wids + i + 1,
				      n - i - 1, &n_used);
		if (verbose) {
			int m;
			printf("log P(%s|", ngram_word(lm, wids[i]));
			/* Print at most (model order - 1) history words,
			 * clipped to the end of the array. */
			m = i + ngram_model_get_size(lm) - 1;
			if (m >= n)
				m = n - 1;
			while (m > i) {
				printf("%s ", ngram_word(lm, wids[m--]));
			}
			printf(") = %d\n", prob);
		}
		ch -= prob;
	}

	/* The word IDs are not needed past this point. */
	ckd_free(wids);

	if (out_n_ccs) *out_n_ccs = nccs;
	if (out_n_oovs) *out_n_oovs = noovs;

	/* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
	n -= (nccs + noovs);
	if (n <= 0)
		return 0;
	if (out_lm_score)
		*out_lm_score = -ch;
	return ch / n;
}
Example #8
0
/*
 * Convert a word to a space-separated phoneme string using a G2P n-gram
 * model.
 *
 * word_grapheme    NUL-terminated word to convert.
 * ngram_g2p_model  G2P n-gram model.
 *
 * Starting from "<s>", the word is consumed left to right: at each step
 * dict_get_winner_wid() picks the best unigram matching at the current
 * offset and its word ID is recorded.  The recorded IDs are then replayed
 * in order and the phoneme part of each unigram is appended to the
 * result, followed by one space; phoneme parts equal to "_" are skipped.
 *
 * Returns a string allocated with ckd_calloc() that the caller owns, or
 * NULL when some position of the word has no matching unigram.
 *
 * Fixes relative to the previous version: the output buffer is sized from
 * the number of segments, so the terminating NUL always fits (it did not
 * when every phoneme was a single character, and the buffer was zero
 * bytes for an empty word); the history list is released on the error
 * path; the unused local totalh is gone.
 */
char *
dict_g2p(char const *word_grapheme, ngram_model_t *ngram_g2p_model) 
{
    char *final_phone = NULL;
    size_t increment = 1;
    int word_offset = 0;
    size_t j;
    size_t grapheme_len = 0, final_phoneme_len = 0, n_segments = 0;
    glist_t history_list = NULL;
    gnode_t *gn;
    int first = 0;
    const int32 *total_unigrams;
    int32 wid_sentence;
    struct winner_t winner;
    const char *word;
    unigram_t unigram;

    total_unigrams = ngram_model_get_counts(ngram_g2p_model);
    wid_sentence = ngram_wid(ngram_g2p_model,"<s>"); /* start with sentence */
    history_list = glist_add_int32(history_list, wid_sentence);
    grapheme_len = strlen(word_grapheme);
    for (j = 0; j < grapheme_len; j += increment) {
        winner = dict_get_winner_wid(ngram_g2p_model, word_grapheme, history_list, *total_unigrams, word_offset);
        increment = winner.length_match;
        if (increment == 0) {
            E_ERROR("Error trying to find matching phoneme (%s) Exiting.. \n" , word_grapheme);
            /* Do not leak the IDs collected so far. */
            glist_free(history_list);
            return NULL;
        }
        history_list = glist_add_int32(history_list, winner.winner_wid);
        word_offset += winner.length_match;
        final_phoneme_len += winner.len_phoneme;
        n_segments++;
    }

    /* The list is built newest-first; put it back into word order. */
    history_list = glist_reverse(history_list);

    /* Each segment contributes its phoneme text plus one trailing space;
     * one more byte holds the terminating NUL.  The buffer is zero-filled,
     * so it starts out as an empty string for strcat(). */
    final_phone = ckd_calloc(final_phoneme_len + n_segments + 1, sizeof(*final_phone));
    for (gn = history_list; gn; gn = gnode_next(gn)) {
        /* The first entry is the "<s>" marker, not a segment. */
        if (!first) {
            first = 1;
            continue;
        }
        word = ngram_word(ngram_g2p_model, gnode_int32(gn));

        if (!word)
            continue;

        unigram  = dict_split_unigram(word);

        /* Append everything except a missing phoneme part or "_". */
        if (unigram.phone != NULL && strcmp(unigram.phone, "_") != 0) {
            strcat(final_phone, unigram.phone);
            strcat(final_phone, " ");
        }

        if (unigram.word)
            ckd_free(unigram.word);
        if (unigram.phone)
            ckd_free(unigram.phone);
    }

    if (history_list)
        glist_free(history_list);

    return final_phone;
}
Example #9
0
/*
 * Exercise class-based language model support on an already loaded model.
 *
 * lmath  logmath object used to compute the expected scores.
 * model  language model under test; it is modified by this function (a
 *        class definition file is read into it, and class words and a
 *        new class are added).
 *
 * The checks are order-dependent: later expectations (word IDs, scores)
 * rely on the modifications made by the earlier steps.  What happens on a
 * failed check is determined by the TEST_* macros (not shown here).
 */
void
run_tests(logmath_t *lmath, ngram_model_t *model)
{
	int32 rv, i;

	TEST_ASSERT(model);

	/* Baseline mapping before any class definitions are loaded. */
	TEST_EQUAL(ngram_wid(model, "scylla"), 285);
	TEST_EQUAL(strcmp(ngram_word(model, 285), "scylla"), 0);

	rv = ngram_model_read_classdef(model, LMDIR "/100.probdef");
	TEST_EQUAL(rv, 0);

	/* Verify that class word IDs remain the same. */
	TEST_EQUAL(ngram_wid(model, "scylla"), 285);
	TEST_EQUAL(strcmp(ngram_word(model, 285), "scylla"), 0);

	/* Verify in-class word IDs. */
	/* The expected ID has the top bit set. */
	TEST_EQUAL(ngram_wid(model, "scylla:scylla"), 0x80000000 | 400);

	/* Verify in-class and out-class unigram scores. */
	/* Expected in-class score = score of the class word plus the log of
	 * a per-word weight (0.4, 0.1, 0.7 below). */
	TEST_EQUAL_LOG(ngram_score(model, "scylla:scylla", NULL),
		       logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.4));
	TEST_EQUAL_LOG(ngram_score(model, "scooby:scylla", NULL),
		       logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.1));
	TEST_EQUAL_LOG(ngram_score(model, "scylla", NULL),
		       logmath_log10_to_log(lmath, -2.7884));
	TEST_EQUAL_LOG(ngram_score(model, "oh:zero", NULL),
		       logmath_log10_to_log(lmath, -1.9038) + logmath_log(lmath, 0.7));
	TEST_EQUAL_LOG(ngram_score(model, "zero", NULL),
		       logmath_log10_to_log(lmath, -1.9038));

	/* Verify class bigram scores. */
	/* An in-class word as the first argument adds its weight to the
	 * expected score; in-class words in the later arguments are expected
	 * to score the same as the plain class word. */
	TEST_EQUAL_LOG(ngram_score(model, "scylla", "on", NULL),
		       logmath_log10_to_log(lmath, -1.2642));
	TEST_EQUAL_LOG(ngram_score(model, "scylla:scylla", "on", NULL),
		       logmath_log10_to_log(lmath, -1.2642) + logmath_log(lmath, 0.4));
	TEST_EQUAL_LOG(ngram_score(model, "apparently", "scylla", NULL),
		       logmath_log10_to_log(lmath, -0.5172));
	TEST_EQUAL_LOG(ngram_score(model, "apparently", "karybdis:scylla", NULL),
		       logmath_log10_to_log(lmath, -0.5172));
	TEST_EQUAL_LOG(ngram_score(model, "apparently", "scooby:scylla", NULL),
		       logmath_log10_to_log(lmath, -0.5172));

	/* Verify class trigram scores. */
	TEST_EQUAL_LOG(ngram_score(model, "zero", "be", "will", NULL),
		       logmath_log10_to_log(lmath, -0.5725));
	TEST_EQUAL_LOG(ngram_score(model, "oh:zero", "be", "will", NULL),
		       logmath_log10_to_log(lmath, -0.5725) + logmath_log(lmath, 0.7));
	TEST_EQUAL_LOG(ngram_score(model, "should", "variance", "zero", NULL),
		       logmath_log10_to_log(lmath, -0.9404));
	TEST_EQUAL_LOG(ngram_score(model, "should", "variance", "zero:zero", NULL),
		       logmath_log10_to_log(lmath, -0.9404));

	/* Add words to classes. */
	/* The new word is expected at ID 0x80000196, and its expected score
	 * uses a weight of 0.2 although 1.0 is passed in.
	 * NOTE(review): presumably the class weights are renormalised when a
	 * word is added - confirm against ngram_model_add_class_word(). */
	rv = ngram_model_add_class_word(model, "scylla", "scrappy:scylla", 1.0);
	TEST_ASSERT(rv >= 0);
	TEST_EQUAL(ngram_wid(model, "scrappy:scylla"), 0x80000196);
	TEST_EQUAL_LOG(ngram_score(model, "scrappy:scylla", NULL),
		       logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.2));
	printf("scrappy:scylla %08x %d %f\n", 
	       ngram_wid(model, "scrappy:scylla"),
	       ngram_score(model, "scrappy:scylla", NULL),
	       logmath_exp(lmath, ngram_score(model, "scrappy:scylla", NULL)));
	/* Add a lot of words to a class. */
	/* 129 generated words "0:scylla" .. "128:scylla"; each is expected to
	 * receive the next consecutive ID starting at 0x80000197. */
	for (i = 0; i < 129; ++i) {
		char word[32];
		sprintf(word, "%d:scylla", i);
		rv = ngram_model_add_class_word(model, "scylla", word, 1.0);
		printf("%s %08x %d %f\n", word,
		       ngram_wid(model, word),
		       ngram_score(model, word, NULL),
		       logmath_exp(lmath, ngram_score(model, word, NULL)));
		TEST_ASSERT(rv >= 0);
		TEST_EQUAL(ngram_wid(model, word), 0x80000197 + i);
	}

	/* Add a new class. */
	/* Two member words with weights 0.6 and 0.4; each member is expected
	 * to score as the class score plus the log of its weight. */
	{
		const char *words[] = { "blatz:foobie", "hurf:foobie" };
		float32 weights[] = { 0.6, 0.4 };
		int32 foobie_prob;
		rv = ngram_model_add_class(model, "[foobie]", 1.0,
					   words, weights, 2);
		TEST_ASSERT(rv >= 0);
		foobie_prob = ngram_score(model, "[foobie]", NULL);
		TEST_EQUAL_LOG(ngram_score(model, "blatz:foobie", NULL),
			       foobie_prob + logmath_log(lmath, 0.6));
		TEST_EQUAL_LOG(ngram_score(model, "hurf:foobie", NULL),
			       foobie_prob + logmath_log(lmath, 0.4));
	}
}