/**
 * Write the unigram section of the export to @output.
 *
 * Emits the "\1-gram" header line, then walks every phrase library index
 * in [0, PHRASE_INDEX_LIBRARY_COUNT).  For each token in a library's range
 * that has a non-zero unigram frequency and can be turned into a string by
 * taglib_token_to_string(), one line of the form
 *     \item <token> <phrase> count <freq>
 * is written.
 *
 * @param output        stream that receives the text.
 * @param phrase_index  phrase index that is queried for ranges and items.
 * @return always true.
 */
bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
    fprintf(output, "\\1-gram\n");
    for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {

        PhraseIndexRange range;
        int range_result = phrase_index->get_range(i, range);
        /* Libraries whose range cannot be retrieved are skipped. */
        if (ERROR_OK != range_result )
            continue;

        PhraseItem item;
        for (phrase_token_t token = range.m_range_begin;
              token < range.m_range_end; token++) {
            int item_result = phrase_index->get_phrase_item(token, item);

            /* Holes in the token range are expected; just move on. */
            if ( item_result == ERROR_NO_ITEM )
                continue;
            assert( item_result == ERROR_OK);
            /* With NDEBUG the assert above compiles away; never print an
             * item that was not successfully fetched. */
            if ( item_result != ERROR_OK )
                continue;

            size_t freq = item.get_unigram_frequency();
            if ( 0 == freq )
                continue;
            char * phrase = taglib_token_to_string(phrase_index, token);
            /* size_t must not be passed for "%ld"; print it through an
             * explicit unsigned long conversion instead. */
            if ( phrase )
                fprintf(output, "\\item %d %s count %lu\n",
                        token, phrase, (unsigned long) freq);

            g_free(phrase);
        }
    }
    return true;
}
/**
 * Write the bigram section of the export to @output.
 *
 * Emits the "\2-gram" header line, then for every token returned by
 * bigram->get_all_items() loads its SingleGram and writes one line per
 * following token of the form
 *     \item <token1> <word1> <token2> <word2> count <freq>
 * Pairs for which either token cannot be converted to a string by
 * taglib_token_to_string() are silently omitted.
 *
 * @param output        stream that receives the text.
 * @param phrase_index  phrase index used to convert tokens to strings.
 * @param bigram        bigram store that is enumerated.
 * @return always true.
 */
bool gen_bigram(FILE * output, FacadePhraseIndex * phrase_index, Bigram * bigram){
    fprintf(output, "\\2-gram\n");

    /* Retrieve all user items. */
    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));

    bigram->get_all_items(items);

    for(size_t i = 0; i < items->len; i++){
        phrase_token_t token = g_array_index(items, phrase_token_t, i);
        SingleGram * single_gram = NULL;
        bigram->load(token, single_gram);

        /* If load() did not hand back a SingleGram there is nothing to
         * export for this token, and dereferencing it would crash. */
        if ( NULL == single_gram )
            continue;

        BigramPhraseWithCountArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
        single_gram->retrieve_all(array);

        /* The first word only depends on the outer token, so convert it
         * once instead of once per following token. */
        char * word1 = taglib_token_to_string(phrase_index, token);

        for(size_t j = 0; j < array->len; j++) {
            BigramPhraseItemWithCount * item = &g_array_index(array, BigramPhraseItemWithCount, j);

            char * word2 = taglib_token_to_string(phrase_index, item->m_token);
            guint32 freq = item->m_count;

            /* freq is an unsigned 32-bit value, hence "%u". */
            if ( word1 && word2)
                fprintf(output, "\\item %d %s %d %s count %u\n",
                        token, word1, item->m_token, word2, freq);

            g_free(word2);
        }

        g_free(word1);
        g_array_free(array, TRUE);
        delete single_gram;
    }

    g_array_free(items, TRUE);
    return true;
}
/* Example #3 */
/* 0 */
/**
 * Write the bigram section of a k-mixture-model export to @output.
 *
 * Emits the "\2-gram" header line, then for every token returned by
 * bigram->get_all_items() loads its KMixtureModelSingleGram and writes one
 * line per array item of the form
 *     \item <word1> <word2> count <WC> T <WC> N_n_0 <..> n_1 <..> Mr <..>
 * Pairs for which either token cannot be converted to a string by
 * taglib_token_to_string() are omitted.
 *
 * @param output        stream that receives the text.
 * @param bigram        k-mixture-model bigram store that is enumerated.
 * @param phrase_index  phrase index used to convert tokens to strings.
 * @return always true.
 */
bool print_k_mixture_model_array_items(FILE * output,
                                       KMixtureModelBigram * bigram,
                                       FacadePhraseIndex * phrase_index){
    fprintf(output, "\\2-gram\n");
    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram->get_all_items(items);

    for (size_t i = 0; i < items->len; ++i) {
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        KMixtureModelSingleGram * single_gram = NULL;

        /* The load must happen outside assert(): with NDEBUG the whole
         * assert expression is discarded and single_gram would stay NULL. */
        bool loaded = bigram->load(*token, single_gram);
        assert(loaded);
        if (!loaded || NULL == single_gram)
            continue;

        FlexibleBigramPhraseArray array = g_array_new
            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
        single_gram->retrieve_all(array);

        /* The first word only depends on the outer token; convert it once. */
        char * word1 = taglib_token_to_string(phrase_index, *token);

        for (size_t m = 0; m < array->len; ++m){
            KMixtureModelArrayItemWithToken * item = &g_array_index(array, KMixtureModelArrayItemWithToken, m);
            char * word2 = taglib_token_to_string(phrase_index, item->m_token);

            /* NOTE(review): m_WC is printed for both "count" and "T", as in
             * the original code -- confirm that this is intended. */
            if (word1 && word2)
                fprintf(output, "\\item %s %s count %d T %d N_n_0 %d n_1 %d Mr %d\n",
                        word1, word2, item->m_item.m_WC, item->m_item.m_WC,
                        item->m_item.m_N_n_0, item->m_item.m_n_1,
                        item->m_item.m_Mr);

            g_free(word2);
        }

        g_free(word1);
        g_array_free(array, TRUE);
        delete single_gram;
    }

    g_array_free(items, TRUE);
    return true;
}
/* Example #4 */
/* 0 */
bool print_k_mixture_model_array_headers(FILE * output,
                                         KMixtureModelBigram * bigram,
                                         FacadePhraseIndex * phrase_index){
    fprintf(output, "\\1-gram\n");
    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram->get_all_items(items);

    for (size_t i = 0; i < items->len; ++i) {
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        KMixtureModelArrayHeader array_header;
        assert(bigram->get_array_header(*token, array_header));
        char * phrase = taglib_token_to_string(phrase_index, *token);
        if ( phrase )
            fprintf(output, "\\item %s count %d freq %d\n",
                    phrase, array_header.m_WC, array_header.m_freq);

        g_free(phrase);
    }
    return true;
}