예제 #1
0
int main(int argc, char * argv[]){
    FILE * input = stdin;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- generate n-gram");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable2 phrase_table;
    /* init phrase table */
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load(SYSTEM_PHRASE_INDEX);
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);
    
    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);

    char* linebuf = NULL; size_t size = 0;
    phrase_token_t last_token, cur_token = last_token = 0;
    while( getline(&linebuf, &size, input) ){
	if ( feof(input) )
	    break;

        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);

	last_token = cur_token;
	cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        /* training uni-gram */
        phrase_index.add_unigram_frequency(cur_token, 1);

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        /* train bi-gram */
        SingleGram * single_gram = NULL;
        bigram.load(last_token, single_gram);

        if ( NULL == single_gram ){
            single_gram = new SingleGram;
        }
        guint32 freq, total_freq;
        /* increase freq */
        if (single_gram->get_freq(cur_token, freq))
            assert(single_gram->set_freq(cur_token, freq + 1));
        else
            assert(single_gram->insert_freq(cur_token, 1));
        /* increase total freq */
        single_gram->get_total_freq(total_freq);
        single_gram->set_total_freq(total_freq + 1);

        bigram.store(last_token, single_gram);
        delete single_gram;
    }

    free(linebuf);
    
    if (!save_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    return 0;
}
예제 #2
0
/**
 * Reads a segmented document and collects uni-gram and word-pair counts.
 *
 * Each line of the document is turned into a token by
 * TAGLIB_PARSE_SEGMENTED_LINE.  For every non-null token the counter
 * stored under that token in hash_of_unigram is incremented, and the
 * counter for the (previous token, current token) pair is incremented
 * in the nested table stored under the previous token in
 * hash_of_document.  Counters are stored directly in the value pointers
 * via GUINT_TO_POINTER.
 *
 * @param phrase_table      not referenced directly by the visible code
 *                          (NOTE(review): the parsing macro may use it
 *                          by name -- confirm).
 * @param phrase_index      phrase index handed to the parsing macro.
 * @param document          input stream, read until end-of-file or a
 *                          read error.
 * @param hash_of_document  first token -> table of (second token -> count).
 * @param hash_of_unigram   token -> count.
 * @return always true.
 */
bool read_document(PhraseLargeTable2 * phrase_table,
                   FacadePhraseIndex * phrase_index,
                   FILE * document,
                   HashofDocument hash_of_document,
                   HashofUnigram hash_of_unigram){

    char * linebuf = NULL;size_t size = 0;
    phrase_token_t last_token, cur_token = last_token = 0;

    /* getline() returns -1 on end-of-file and on read errors; testing
     * the result explicitly ends the loop in both cases and still
     * processes a final line that lacks a trailing newline. */
    while ( getline(&linebuf, &size, document) != -1 ){
        /* strip the trailing newline; the length check protects
         * against indexing before the start of an empty string. */
        size_t length = strlen(linebuf);
        if ( length > 0 && '\n' == linebuf[length - 1] ) {
            linebuf[length - 1] = '\0';
        }

        /* the macro is expected to declare `token` for this line. */
        TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf);

        last_token = cur_token;
        cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        /* count the uni-gram: a missing entry starts from zero. */
        gpointer value = NULL;
        gboolean lookup_result = g_hash_table_lookup_extended
            (hash_of_unigram, GUINT_TO_POINTER(cur_token),
             NULL, &value);
        guint32 freq = lookup_result ? GPOINTER_TO_UINT(value) : 0;
        freq ++;
        g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token),
                            GUINT_TO_POINTER(freq));

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !g_train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        /* remember the (last_token, cur_token) word pair. */
        HashofSecondWord hash_of_second_word = NULL;
        value = NULL;
        lookup_result = g_hash_table_lookup_extended
            (hash_of_document, GUINT_TO_POINTER(last_token),
             NULL, &value);
        if ( !lookup_result ){
            /* first pair for this first word: create its table and
             * register it once; an existing table is updated in place
             * and does not need to be inserted again. */
            hash_of_second_word = g_hash_table_new
                (g_direct_hash, g_direct_equal);
            g_hash_table_insert(hash_of_document,
                                GUINT_TO_POINTER(last_token),
                                hash_of_second_word);
        } else {
            hash_of_second_word = (HashofSecondWord) value;
        }

        /* increase the pair count, starting from zero when absent. */
        value = NULL;
        lookup_result = g_hash_table_lookup_extended
            (hash_of_second_word, GUINT_TO_POINTER(cur_token),
             NULL, &value);
        guint32 count = 0;
        if ( lookup_result ) {
            count = GPOINTER_TO_UINT(value);
        }
        count ++;
        g_hash_table_insert(hash_of_second_word,
                            GUINT_TO_POINTER(cur_token),
                            GUINT_TO_POINTER(count));
    }

    free(linebuf);

    return true;
}
예제 #3
0
/**
 * Entry point of the deleted bi-gram trainer.
 *
 * Recognised arguments:
 *   --help                        print the help text and exit with 0
 *   --skip-pi-gram-training       do not count pairs that start a sentence
 *   --deleted-bigram-file <file>  bi-gram database to attach
 *                                 (default "deleted_bigram.db")
 * Any other argument, or a missing file name, prints the help text and
 * exits with EINVAL.
 *
 * After loading the phrase table and phrase index, stdin is read line by
 * line.  Every line is turned into a token by
 * TAGLIB_PARSE_SEGMENTED_LINE, and the frequency of each
 * (previous token, current token) pair is incremented in the bi-gram
 * database.
 *
 * @return 0 on success; the process exits with ENODATA when the phrase
 *         index cannot be loaded.
 */
int main(int argc, char * argv[]){
    int i = 1;
    bool train_pi_gram = true;
    const char * bigram_filename = "deleted_bigram.db";

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            train_pi_gram = false;
        } else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){
            /* the option needs a value in the following argument. */
            if ( ++i >= argc ) {
                print_help();
                exit(EINVAL);
            }
            bigram_filename = argv[i];
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    /* load phrase table. */
    /* NOTE(review): the chunk is handed to phrase_table.load() and never
     * deleted here; presumably the table takes ownership -- confirm. */
    PhraseLargeTable2 phrase_table;
    MemoryChunk * new_chunk = new MemoryChunk;
    new_chunk->load("phrase_index.bin");
    phrase_table.load(new_chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENODATA);

    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);

    char* linebuf = NULL; size_t size = 0;
    phrase_token_t last_token, cur_token = last_token = 0;
    /* getline() returns -1 on end-of-file and on read errors; testing
     * the result explicitly ends the loop in both cases and still
     * processes a final line that lacks a trailing newline. */
    while ( getline(&linebuf, &size, stdin) != -1 ){
        /* strip the trailing newline; the length check protects
         * against indexing before the start of an empty string. */
        size_t length = strlen(linebuf);
        if ( length > 0 && '\n' == linebuf[length - 1] ) {
            linebuf[length - 1] = '\0';
        }

        /* the macro is expected to declare `token` for this line. */
        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);

        last_token = cur_token;
        cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        /* train bi-gram */
        SingleGram * single_gram = NULL;
        bigram.load(last_token, single_gram);

        if ( NULL == single_gram ){
            single_gram = new SingleGram;
        }
        guint32 freq = 0, total_freq = 0;
        /* increase freq; the update is performed outside assert() so
         * that it still happens when NDEBUG disables assertions. */
        bool updated;
        if (single_gram->get_freq(cur_token, freq))
            updated = single_gram->set_freq(cur_token, freq + 1);
        else
            updated = single_gram->insert_freq(cur_token, 1);
        assert(updated);
        (void) updated;
        /* increase total freq */
        single_gram->get_total_freq(total_freq);
        single_gram->set_total_freq(total_freq + 1);

        bigram.store(last_token, single_gram);
        delete single_gram;
    }

    free(linebuf);
    return 0;
}