int main(int argc, char * argv[]){ FILE * input = stdin; const char * bigram_filename = "bigram.db"; PhraseLargeTable phrases; MemoryChunk * chunk = new MemoryChunk; bool retval = chunk->load("phrase_index.bin"); if (!retval) { fprintf(stderr, "open phrase_index.bin failed!\n"); exit(ENOENT); } phrases.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); if (!retval) { fprintf(stderr, "open %s failed!\n", bigram_filename); exit(ENOENT); } taglib_init(); values = g_ptr_array_new(); required = g_hash_table_new(g_str_hash, g_str_equal); //enter "\data" line assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", "")); ssize_t result = my_getline(input); if ( result == -1 ) { fprintf(stderr, "empty file input.\n"); exit(ENODATA); } //read "\data" line if ( !taglib_read(linebuf, line_type, values, required) ) { fprintf(stderr, "error: interpolation model expected.\n"); exit(ENODATA); } assert(line_type == BEGIN_LINE); char * value = NULL; assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value)); if ( !( strcmp("interpolation", value) == 0 ) ) { fprintf(stderr, "error: interpolation model expected.\n"); exit(ENODATA); } result = my_getline(input); if ( result != -1 ) parse_body(input, &phrases, &phrase_index, &bigram); taglib_fini(); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ int i = 1; const char * table_dir = "."; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp("--table-dir", argv[i]) == 0){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } table_dir = argv[i]; } ++i; } /* generate pinyin index*/ pinyin_option_t options = USE_TONE; ChewingLargeTable chewinglargetable(options); PhraseLargeTable phraselargetable; /* generate phrase index */ FacadePhraseIndex phrase_index; for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { const pinyin_table_info_t * table_info = pinyin_phrase_files + i; if (SYSTEM_FILE != table_info->m_file_type) continue; const char * tablename = table_info->m_table_filename; gchar * filename = g_build_filename(table_dir, tablename, NULL); FILE * tablefile = fopen(filename, "r"); if (NULL == tablefile) { fprintf(stderr, "open %s failed!\n", tablename); exit(ENOENT); } chewinglargetable.load_text(tablefile); fseek(tablefile, 0L, SEEK_SET); phraselargetable.load_text(tablefile); fseek(tablefile, 0L, SEEK_SET); phrase_index.load_text(i, tablefile); fclose(tablefile); g_free(filename); } MemoryChunk * new_chunk = new MemoryChunk; chewinglargetable.store(new_chunk); new_chunk->save("pinyin_index.bin"); chewinglargetable.load(new_chunk); new_chunk = new MemoryChunk; phraselargetable.store(new_chunk); new_chunk->save("phrase_index.bin"); phraselargetable.load(new_chunk); phrase_index.compact(); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ FILE * input = stdin; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- generate n-gram"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load(SYSTEM_PHRASE_INDEX); phrase_table.load(chunk); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, input) ){ if ( feof(input) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); if (!save_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = null_token; if ( 0 != phrase_len ) { phrase_index.clear_tokens(tokens); int result = phrase_table.search(phrase_len, phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } phrase_index.destroy_tokens(tokens); free(linebuf); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }