Пример #1
0
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            g_train_pi_gram = false;
        } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_occurs = atoi(argv[i]);
        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_increase_rates = atof(argv[i]);
        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            break;
        }
        ++i;
    }

    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
Пример #2
0
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- generate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}