int main(int argc, char * argv[]){ SystemTableInfo2 system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); Bigram deleted_bigram; deleted_bigram.attach(DELETED_BIGRAM, ATTACH_READONLY); GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); deleted_bigram.get_all_items(deleted_items); parameter_t lambda_sum = 0; int lambda_count = 0; for (size_t i = 0; i < deleted_items->len; ++i ){ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); SingleGram * single_gram = NULL; bigram.load(*token, single_gram); SingleGram * deleted_single_gram = NULL; deleted_bigram.load(*token, deleted_single_gram); parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram); printf("token:%d lambda:%f\n", *token, lambda); lambda_sum += lambda; lambda_count ++; if (single_gram) delete single_gram; delete deleted_single_gram; } printf("average lambda:%f\n", (lambda_sum/lambda_count)); g_array_free(deleted_items, TRUE); return 0; }
int main(int argc, char * argv[]){ FacadePhraseIndex phrase_index; //gb_char binary file MemoryChunk * chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); Bigram bigram; bigram.attach("bigram.db", ATTACH_READONLY); Bigram deleted_bigram; deleted_bigram.attach("deleted_bigram.db", ATTACH_READONLY); GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); deleted_bigram.get_all_items(deleted_items); parameter_t lambda_sum = 0; int lambda_count = 0; for ( int i = 0; i < deleted_items->len; ++i ){ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); SingleGram * single_gram = NULL; bigram.load(*token, single_gram); SingleGram * deleted_single_gram = NULL; deleted_bigram.load(*token, deleted_single_gram); parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram); printf("token:%d lambda:%f\n", *token, lambda); lambda_sum += lambda; lambda_count ++; if (single_gram) delete single_gram; delete deleted_single_gram; } printf("average lambda:%f\n", (lambda_sum/lambda_count)); g_array_free(deleted_items, TRUE); return 0; }
int main(int argc, char * argv[]){ SingleGram single_gram; const guint32 total_freq = 16; assert(single_gram.set_total_freq(total_freq)); phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3}; guint32 freqs[6] = { 1, 2, 4, 8, 16, 32}; guint32 freq; for(size_t i = 0; i < 6 ;++i){ if ( single_gram.get_freq(tokens[i], freq)) assert(single_gram.set_freq(tokens[i], freqs[i])); else assert(single_gram.insert_freq(tokens[i], freqs[i])); } single_gram.get_freq(3, freq); assert(freq == 32); printf("--------------------------------------------------------\n"); PhraseIndexRange range; BigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(BigramPhraseItem)); range.m_range_begin = 0; range.m_range_end = 8; single_gram.search(&range,array); for ( size_t i = 0; i < array->len; ++i){ BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); printf("item:%d:%f\n", item->m_token, item->m_freq); } assert(single_gram.get_total_freq(freq)); assert(freq == total_freq); Bigram bigram; assert(bigram.attach("/tmp/test.db", ATTACH_CREATE|ATTACH_READWRITE)); bigram.store(1, &single_gram); assert(single_gram.insert_freq(5, 8)); assert(single_gram.remove_freq(1, freq)); single_gram.set_total_freq(32); bigram.store(2, &single_gram); SingleGram * gram = NULL; for ( int m = 1; m <= 2; ++m ){ printf("--------------------------------------------------------\n"); bigram.load(m, gram); g_array_set_size(array, 0); range.m_range_begin = 0; range.m_range_end = 8; gram->search(&range,array); for ( size_t i = 0; i < array->len; ++i){ BigramPhraseItem * item = &g_array_index(array, BigramPhraseItem, i); printf("item:%d:%f\n", item->m_token, item->m_freq); } delete gram; } printf("--------------------------------------------------------\n"); assert(single_gram.get_total_freq(freq)); printf("total_freq:%d\n", freq); g_array_free(array, TRUE); GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); bigram.get_all_items(items); printf("----------------------system----------------------------\n"); for ( size_t i = 0; i < items->len; ++i){ phrase_token_t * token = &g_array_index(items, phrase_token_t, i); printf("item:%d\n", *token); } assert(bigram.load_db("/tmp/test.db")); assert(bigram.save_db("/tmp/test.db")); g_array_free(items, TRUE); /* mask out all index items. */ bigram.mask_out(0x0, 0x0); return 0; }
int main(int argc, char * argv[]){ FILE * input = stdin; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- generate n-gram"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load(SYSTEM_PHRASE_INDEX); phrase_table.load(chunk); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, input) ){ if ( feof(input) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); if (!save_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } g_phrases = new PhraseLargeTable; //init phrase lookup MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); g_phrases->load(chunk); FacadePhraseIndex phrase_index; //gb_char binary file chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; utf16_t * phrase = g_utf8_to_utf16(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = 0; if ( 0 != phrase_len ) { int result = g_phrases->search( phrase_len, phrase, token); if ( ! (result & SEARCH_OK) ) token = 0; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; //training uni-gram phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } //train bi-gram SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; //increase freq if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); //increase total freq single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); MemoryChunk * new_chunk = new MemoryChunk; phrase_index.store(1, new_chunk); new_chunk->save("gb_char.bin"); phrase_index.load(1, new_chunk); new_chunk = new MemoryChunk; phrase_index.store(2, new_chunk); new_chunk->save("gbk_char.bin"); phrase_index.load(2, new_chunk); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = null_token; if ( 0 != phrase_len ) { phrase_index.clear_tokens(tokens); int result = phrase_table.search(phrase_len, phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } phrase_index.destroy_tokens(tokens); free(linebuf); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "deleted_bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; } else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; } else { print_help(); exit(EINVAL); } ++i; } /* load phrase table. */ PhraseLargeTable2 phrase_table; MemoryChunk * new_chunk = new MemoryChunk; new_chunk->load("phrase_index.bin"); phrase_table.load(new_chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENODATA); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; //increase freq if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); //increase total freq single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); return 0; }