int main(int argc, char * argv[]){ FILE * output = stdout; const char * bigram_filename = "bigram.db"; FacadePhraseIndex phrase_index; //gb_char binary file MemoryChunk * chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); Bigram bigram; bigram.attach(bigram_filename, ATTACH_READONLY); begin_data(output); gen_unigram(output, &phrase_index); gen_bigram(output, &phrase_index, &bigram); end_data(output); return 0; }
int main(int argc, char * argv[]){ FacadePhraseIndex phrase_index; /* gb_char binary file */ MemoryChunk * chunk = new MemoryChunk; bool retval = chunk->load("gb_char.bin"); if (!retval) { fprintf(stderr, "open gb_char.bin failed!\n"); exit(ENOENT); } phrase_index.load(1, chunk); /* gbk_char binary file */ chunk = new MemoryChunk; retval = chunk->load("gbk_char.bin"); if (!retval) { fprintf(stderr, "open gbk_char.bin failed!\n"); exit(ENOENT); } phrase_index.load(2, chunk); /* Note: please increase the value when corpus size becomes larger. * To avoid zero value when computing unigram frequency in float format. */ guint32 freq = 1; PhraseIndexRange range; int result = phrase_index.get_range(1, range); if ( result == ERROR_OK ) { for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) { phrase_index.add_unigram_frequency(i, freq); } } #if 1 result = phrase_index.get_range(2, range); if ( result == ERROR_OK ) { for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) { phrase_index.add_unigram_frequency(i, freq); } } #endif MemoryChunk * new_chunk = new MemoryChunk; phrase_index.store(1, new_chunk); new_chunk->save("gb_char.bin"); phrase_index.load(1, new_chunk); new_chunk = new MemoryChunk; phrase_index.store(2, new_chunk); new_chunk->save("gbk_char.bin"); phrase_index.load(2, new_chunk); return 0; }
int main(int argc, char * argv[]){ FacadePhraseIndex phrase_index; //gb_char binary file MemoryChunk * chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); Bigram bigram; bigram.attach("bigram.db", ATTACH_READONLY); Bigram deleted_bigram; deleted_bigram.attach("deleted_bigram.db", ATTACH_READONLY); GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); deleted_bigram.get_all_items(deleted_items); parameter_t lambda_sum = 0; int lambda_count = 0; for ( int i = 0; i < deleted_items->len; ++i ){ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); SingleGram * single_gram = NULL; bigram.load(*token, single_gram); SingleGram * deleted_single_gram = NULL; deleted_bigram.load(*token, deleted_single_gram); parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram); printf("token:%d lambda:%f\n", *token, lambda); lambda_sum += lambda; lambda_count ++; if (single_gram) delete single_gram; delete deleted_single_gram; } printf("average lambda:%f\n", (lambda_sum/lambda_count)); g_array_free(deleted_items, TRUE); return 0; }
int main(int argc, char * argv[]){ int i = 1; const char * k_mixture_model_filename = NULL; FILE * output = stdout; while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp ("--k-mixture-model-file", argv[i]) == 0 ){ if ( ++i > argc ){ print_help(); exit(EINVAL); } k_mixture_model_filename = argv[i]; } else { print_help(); exit(EINVAL); } ++i; } FacadePhraseIndex phrase_index; //gb_char binary file MemoryChunk * chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READONLY); print_k_mixture_model_magic_header(output, &bigram); print_k_mixture_model_array_headers(output, &bigram, &phrase_index); print_k_mixture_model_array_items(output, &bigram, &phrase_index); end_data(output); return 0; }
int main( int argc, char * argv[]){ SystemTableInfo system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; MemoryChunk * chunk = new MemoryChunk; chunk->load("../../data/pinyin_index.bin"); largetable.load(options, chunk, NULL); const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); FacadePhraseIndex phrase_index; if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram system_bigram; system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram user_bigram; user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); gfloat lambda = system_table_info.get_lambda(); PinyinLookup2 pinyin_lookup(lambda, options, &largetable, &phrase_index, &system_bigram, &user_bigram); /* prepare the prefixes for get_best_match. */ TokenVector prefixes = g_array_new (FALSE, FALSE, sizeof(phrase_token_t)); g_array_append_val(prefixes, sentence_start); CandidateConstraints constraints = g_array_new (TRUE, FALSE, sizeof(lookup_constraint_t)); MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); char* linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); if ( 0 == keys->len ) /* invalid pinyin */ continue; /* initialize constraints. */ g_array_set_size(constraints, keys->len); for ( size_t i = 0; i < constraints->len; ++i){ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); constraint->m_type = NO_CONSTRAINT; } guint32 start_time = record_time(); for ( size_t i = 0; i < bench_times; ++i) pinyin_lookup.get_best_match(prefixes, keys, constraints, results); print_time(start_time, bench_times); for ( size_t i = 0; i < results->len; ++i){ phrase_token_t * token = &g_array_index(results, phrase_token_t, i); if ( null_token == *token) continue; printf("pos:%ld,token:%d\t", i, *token); } printf("\n"); char * sentence = NULL; pinyin_lookup.convert_to_utf8(results, sentence); printf("%s\n", sentence); g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); g_free(sentence); } g_array_free(prefixes, TRUE); g_array_free(constraints, TRUE); g_array_free(results, TRUE); free(linebuf); return 0; }
int main(int argc, char * argv[]){ FILE * input = stdin; const char * bigram_filename = "bigram.db"; PhraseLargeTable phrases; MemoryChunk * chunk = new MemoryChunk; bool retval = chunk->load("phrase_index.bin"); if (!retval) { fprintf(stderr, "open phrase_index.bin failed!\n"); exit(ENOENT); } phrases.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); if (!retval) { fprintf(stderr, "open %s failed!\n", bigram_filename); exit(ENOENT); } taglib_init(); values = g_ptr_array_new(); required = g_hash_table_new(g_str_hash, g_str_equal); //enter "\data" line assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); ssize_t result = my_getline(input); if ( result == -1 ) { fprintf(stderr, "empty file input.\n"); exit(ENODATA); } //read "\data" line if ( !taglib_read(linebuf, line_type, values, required) ) { fprintf(stderr, "error: interpolation model expected.\n"); exit(ENODATA); } assert(line_type == BEGIN_LINE); char * value = NULL; assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value)); if ( !( strcmp("interpolation", value) == 0 ) ) { fprintf(stderr, "error: interpolation model expected.\n"); exit(ENODATA); } result = my_getline(input); if ( result != -1 ) parse_body(input, &phrases, &phrase_index, &bigram); taglib_fini(); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ MemoryChunk* chunk; chunk = new MemoryChunk(); int i = 12; chunk->set_content(0, &i, sizeof(int)); int * p = (int *)chunk->begin(); assert(chunk->size() == sizeof(int)); std::cout<<*p<<std::endl; std::cout<<chunk->capacity()<<std::endl; p = & i; chunk->set_chunk(p, sizeof(int), NULL); short t = 5; chunk->set_content(sizeof(int), &t, sizeof(short)); assert( sizeof(int) + sizeof(short) == chunk->size()); std::cout<<chunk->capacity()<<std::endl; p = (int *)chunk->begin(); short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); std::cout<<*p<<'\t'<<*p2<<std::endl; chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short)); assert( sizeof(int) + (sizeof(short) << 1) == chunk->size()); std::cout<<chunk->capacity()<<std::endl; p = (int *)chunk->begin(); p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<std::endl; chunk->set_size(sizeof(int) + sizeof(short) *3); p = (int *)chunk->begin(); p2 =(short *)(((char *) (chunk->begin())) + sizeof(int)); chunk->set_content(0, &i, sizeof(int)); *(p2+2) = 3; std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<'\t'<<*(p2+2)<<std::endl; int m = 10; chunk->set_chunk(&m, sizeof(int), NULL); int n = 12; chunk->insert_content(sizeof(int), &n, sizeof(int)); n = 11; chunk->insert_content(sizeof(int), &n, sizeof(int)); int * p3 = (int *)chunk->begin(); std::cout<<*p3<<'\t'<<*(p3+1)<<'\t'<<*(p3+2)<<std::endl; chunk->remove_content(sizeof(int), sizeof(int)); std::cout<<*p3<<'\t'<<*(p3+1)<<std::endl; int tmp; assert(chunk->get_content(sizeof(int), &tmp, sizeof(int))); std::cout<<tmp<<std::endl; delete chunk; const char * filename = "/tmp/version"; const char * version = "0.2.0"; chunk = new MemoryChunk; bool retval = chunk->load(filename); if ( !retval ){ std::cerr<<"can't find chunk"<<std::endl; }else{ if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){ std::cout<<"match"<<std::endl; } } chunk->set_content(0, version, strlen(version) + 1); chunk->save(filename); retval = chunk->load(filename); if ( !retval ){ std::cerr<<"can't find chunk"<<std::endl; } if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){ std::cout<<"match"<<std::endl; } return 0; }
int main(int argc, char * argv[]){ int i = 1; bool gen_extra_enter = false; setlocale(LC_ALL, ""); //deal with options. while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp("--generate-extra-enter", argv[i]) == 0 ){ gen_extra_enter = true; } else { print_help(); exit(EINVAL); } ++i; } //init phrase table PhraseLargeTable phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); //init phrase index FacadePhraseIndex phrase_index; chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); //init bi-gram Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); Bigram user_bigram; //init phrase lookup g_phrase_lookup = new PhraseLookup(&phrase_table, &phrase_index, &system_bigram, &user_bigram); CONTEXT_STATE state, next_state; GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t)); phrase_token_t token = null_token; //split the sentence char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } //check non-ucs2 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf); printf("\n"); continue; } /* only new-line persists. */ if ( 0 == num_of_chars ) { printf("\n"); continue; } state = CONTEXT_INIT; bool result = phrase_table.search( 1, sentence, token); g_array_append_val( current_utf16, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { bool result = phrase_table.search( 1, sentence + i, token); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_utf16, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(current_utf16); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_utf16); /* save the current character */ g_array_set_size(current_utf16, 0); g_array_append_val(current_utf16, sentence[i]); state = next_state; } if ( current_utf16->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(current_utf16); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_utf16); g_array_set_size(current_utf16, 0); } /* print extra enter */ if ( gen_extra_enter ) printf("\n"); } delete g_phrase_lookup; g_phrase_lookup = NULL; /* print enter at file tail */ printf("\n"); g_array_free(current_utf16, TRUE); free(linebuf); return 0; }
int main(int argc, char * argv[]){ const char * evals_text = "evals.text"; pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; MemoryChunk * chunk = new MemoryChunk; chunk->load("pinyin_index.bin"); largetable.load(options, chunk, NULL); FacadePhraseTable2 phrase_table; chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); Bigram user_bigram; user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); PinyinLookup2 pinyin_lookup(options, &largetable, &phrase_index, &system_bigram, &user_bigram); /* open evals.text. */ FILE * evals_file = fopen(evals_text, "r"); if ( NULL == evals_file ) { fprintf(stderr, "Can't open file:%s\n", evals_text); exit(ENOENT); } PhraseTokens phrase_tokens; memset(phrase_tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(phrase_tokens); /* Evaluates the correction rate of test text documents. */ size_t tested_count = 0; size_t passed_count = 0; char* linebuf = NULL; size_t size = 0; TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t)); phrase_token_t token = null_token; while( getline(&linebuf, &size, evals_file) ) { if ( feof(evals_file) ) break; if ( '\n' == linebuf[strlen(linebuf)-1] ) linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); token = null_token; if ( 0 != phrase_len ) { int result = phrase_table.search(phrase_len, phrase, phrase_tokens); int num = get_first_token(phrase_tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } if ( null_token == token ) { if ( tokens->len ) { /* one test. */ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { tested_count ++; passed_count ++; } else { tested_count ++; } g_array_set_size(tokens, 0); } } else { g_array_append_val(tokens, token); } } if ( tokens->len ) { /* one test. */ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { tested_count ++; passed_count ++; } else { tested_count ++; } } parameter_t rate = passed_count / (parameter_t) tested_count; printf("correction rate:%f\n", rate); g_array_free(tokens, TRUE); fclose(evals_file); free(linebuf); phrase_index.destroy_tokens(phrase_tokens); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } g_phrases = new PhraseLargeTable; //init phrase lookup MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); g_phrases->load(chunk); FacadePhraseIndex phrase_index; //gb_char binary file chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; utf16_t * phrase = g_utf8_to_utf16(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = 0; if ( 0 != phrase_len ) { int result = g_phrases->search( phrase_len, phrase, token); if ( ! (result & SEARCH_OK) ) token = 0; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; //training uni-gram phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } //train bi-gram SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; //increase freq if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); //increase total freq single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); MemoryChunk * new_chunk = new MemoryChunk; phrase_index.store(1, new_chunk); new_chunk->save("gb_char.bin"); phrase_index.load(1, new_chunk); new_chunk = new MemoryChunk; phrase_index.store(2, new_chunk); new_chunk->save("gbk_char.bin"); phrase_index.load(2, new_chunk); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool gen_extra_enter = false; setlocale(LC_ALL, ""); //deal with options. while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0) { print_help(); exit(0); } else if (strcmp("--generate-extra-enter", argv[i]) == 0) { gen_extra_enter = true; } else { print_help(); exit(EINVAL); } ++i; } /* init phrase table */ FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } //check non-ucs4 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); printf("\n"); continue; } //do segment stuff GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); segment(&phrase_table, &phrase_index, sentence, len, strings); //print out the split phrase for ( glong i = 0; i < strings->len; ++i ) { SegmentStep * step = &g_array_index(strings, SegmentStep, i); char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); printf("%d %s\n", step->m_handle, string); g_free(string); } /* print extra enter */ if ( gen_extra_enter ) printf("\n"); g_array_free(strings, TRUE); g_free(sentence); } /* print enter at file tail */ printf("\n"); return 0; }
int main(int argc, char * argv[]){ int i = 1; setlocale(LC_ALL, ""); //deal with options. while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0 ){ print_help(); exit(0); } else { print_help(); exit(EINVAL); } ++i; } //init phrase table PhraseLargeTable phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("../../data/phrase_index.bin"); phrase_table.load(chunk); //init phrase index FacadePhraseIndex phrase_index; chunk = new MemoryChunk; chunk->load("../../data/gb_char.bin"); phrase_index.load(1, chunk); chunk = new MemoryChunk; chunk->load("../../data/gbk_char.bin"); phrase_index.load(2, chunk); //init bi-gram Bigram system_bigram; system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram user_bigram; //init phrase lookup PhraseLookup phrase_lookup(&phrase_table, &phrase_index, &system_bigram, &user_bigram); //try one sentence char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; //check non-ucs2 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs2 characters are not accepted.\n"); g_free(sentence); continue; } try_phrase_lookup(&phrase_lookup, sentence, len); g_free(sentence); } free(linebuf); return 0; }
int main(int argc, char * argv[]){ FILE * input = stdin; FILE * output = stdout; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- n-gram segment"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } if (outputfile) { output = fopen(outputfile, "w"); if (NULL == output) { perror("open file failed"); exit(EINVAL); } } if (argc > 2) { fprintf(stderr, "too many arguments.\n"); exit(EINVAL); } if (2 == argc) { input = fopen(argv[1], "r"); if (NULL == input) { perror("open file failed"); exit(EINVAL); } } SystemTableInfo system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } /* init phrase table */ FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load(SYSTEM_PHRASE_INDEX); phrase_table.load(chunk, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); /* init bi-gram */ Bigram system_bigram; system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); Bigram user_bigram; gfloat lambda = system_table_info.get_lambda(); /* init phrase lookup */ PhraseLookup phrase_lookup(lambda, &phrase_table, &phrase_index, &system_bigram, &user_bigram); CONTEXT_STATE state, next_state; GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); /* split the sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, input)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } /* check non-ucs4 characters */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); fprintf(output, "%d \n", null_token); continue; } /* only new-line persists. */ if ( 0 == num_of_chars ) { fprintf(output, "%d \n", null_token); continue; } state = CONTEXT_INIT; int result = phrase_table.search( 1, sentence, tokens); g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { int result = phrase_table.search( 1, sentence + i, tokens); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_ucs4, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); /* save the current character */ g_array_set_size(current_ucs4, 0); g_array_append_val(current_ucs4, sentence[i]); state = next_state; } if ( current_ucs4->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); g_array_set_size(current_ucs4, 0); } /* print extra enter */ if ( gen_extra_enter ) fprintf(output, "%d \n", null_token); g_free(sentence); } phrase_index.destroy_tokens(tokens); /* print enter at file tail */ fprintf(output, "%d \n", null_token); g_array_free(current_ucs4, TRUE); free(linebuf); fclose(input); fclose(output); return 0; }
int main(int argc, char * argv[]){ int i = 1; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- generate k mixture model"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load(SYSTEM_PHRASE_INDEX); phrase_table.load(chunk); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); while ( i < argc ){ const char * filename = argv[i]; FILE * document = fopen(filename, "r"); if ( NULL == document ){ int err_saved = errno; fprintf(stderr, "can't open file: %s.\n", filename); fprintf(stderr, "error:%s.\n", strerror(err_saved)); exit(err_saved); } HashofDocument hash_of_document = g_hash_table_new (g_direct_hash, g_direct_equal); HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram)); fclose(document); document = NULL; GHashTableIter iter; gpointer key, value; /* train the document, and convert it to k mixture model. */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { phrase_token_t token1 = GPOINTER_TO_UINT(key); train_second_word(hash_of_unigram, &bigram, hash_of_document, token1); } KMixtureModelMagicHeader magic_header; assert(bigram.get_magic_header(magic_header)); magic_header.m_N ++; assert(bigram.set_magic_header(magic_header)); post_processing_unigram(&bigram, hash_of_unigram); /* free resources of g_hash_of_document */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { HashofSecondWord second_word = (HashofSecondWord) value; g_hash_table_iter_steal(&iter); g_hash_table_unref(second_word); } g_hash_table_unref(hash_of_document); hash_of_document = NULL; g_hash_table_unref(hash_of_unigram); hash_of_unigram = NULL; ++i; } return 0; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = null_token; if ( 0 != phrase_len ) { phrase_index.clear_tokens(tokens); int result = phrase_table.search(phrase_len, phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } phrase_index.destroy_tokens(tokens); free(linebuf); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ int i = 1; const char * k_mixture_model_filename = NULL; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ g_train_pi_gram = false; } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } g_maximum_occurs = atoi(argv[i]); } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } g_maximum_increase_rates = atof(argv[i]); } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } k_mixture_model_filename = argv[i]; } else { break; } ++i; } PhraseLargeTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); while ( i < argc ){ const char * filename = argv[i]; FILE * document = fopen(filename, "r"); if ( NULL == document ){ int err_saved = errno; fprintf(stderr, "can't open file: %s.\n", filename); fprintf(stderr, "error:%s.\n", strerror(err_saved)); exit(err_saved); } HashofDocument hash_of_document = g_hash_table_new (g_direct_hash, g_direct_equal); HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram)); fclose(document); document = NULL; GHashTableIter iter; gpointer key, value; /* train the document, and convert it to k mixture model. */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { phrase_token_t token1 = GPOINTER_TO_UINT(key); train_second_word(hash_of_unigram, &bigram, hash_of_document, token1); } KMixtureModelMagicHeader magic_header; assert(bigram.get_magic_header(magic_header)); magic_header.m_N ++; assert(bigram.set_magic_header(magic_header)); post_processing_unigram(&bigram, hash_of_unigram); /* free resources of g_hash_of_document */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { HashofSecondWord second_word = (HashofSecondWord) value; g_hash_table_iter_steal(&iter); g_hash_table_unref(second_word); } g_hash_table_unref(hash_of_document); hash_of_document = NULL; g_hash_table_unref(hash_of_unigram); hash_of_unigram = NULL; ++i; } return 0; }
int main(int argc, char * argv[]){ FILE * input = stdin; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- generate n-gram"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load(SYSTEM_PHRASE_INDEX); phrase_table.load(chunk); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, input) ){ if ( feof(input) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); if (!save_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ setlocale(LC_ALL, ""); SystemTableInfo system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } /* init phrase table */ FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("../../data/phrase_index.bin"); phrase_table.load(chunk, NULL); const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); /* init phrase index */ FacadePhraseIndex phrase_index; if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); /* init bi-gram */ Bigram system_bigram; system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram user_bigram; gfloat lambda = system_table_info.get_lambda(); /* init phrase lookup */ PhraseLookup phrase_lookup(lambda, &phrase_table, &phrase_index, &system_bigram, &user_bigram); /* try one sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; /* check non-ucs4 characters */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters are not accepted.\n"); g_free(sentence); continue; } try_phrase_lookup(&phrase_lookup, sentence, len); g_free(sentence); } free(linebuf); return 0; }