int main(int argc, char * argv[]){ int i = 1; const char * k_mixture_model_filename = NULL; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ g_train_pi_gram = false; } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } g_maximum_occurs = atoi(argv[i]); } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } g_maximum_increase_rates = atof(argv[i]); } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){ if ( ++i >= argc ){ print_help(); exit(EINVAL); } k_mixture_model_filename = argv[i]; } else { break; } ++i; } PhraseLargeTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); while ( i < argc ){ const char * filename = argv[i]; FILE * document = fopen(filename, "r"); if ( NULL == document ){ int err_saved = errno; fprintf(stderr, "can't open file: %s.\n", filename); fprintf(stderr, "error:%s.\n", strerror(err_saved)); exit(err_saved); } HashofDocument hash_of_document = g_hash_table_new (g_direct_hash, g_direct_equal); HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram)); fclose(document); document = NULL; GHashTableIter iter; gpointer key, value; /* train the document, and convert it to k mixture model. */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { phrase_token_t token1 = GPOINTER_TO_UINT(key); train_second_word(hash_of_unigram, &bigram, hash_of_document, token1); } KMixtureModelMagicHeader magic_header; assert(bigram.get_magic_header(magic_header)); magic_header.m_N ++; assert(bigram.set_magic_header(magic_header)); post_processing_unigram(&bigram, hash_of_unigram); /* free resources of g_hash_of_document */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { HashofSecondWord second_word = (HashofSecondWord) value; g_hash_table_iter_steal(&iter); g_hash_table_unref(second_word); } g_hash_table_unref(hash_of_document); hash_of_document = NULL; g_hash_table_unref(hash_of_unigram); hash_of_unigram = NULL; ++i; } return 0; }
int main(int argc, char * argv[]){ int i = 1; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- generate k mixture model"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo2 system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable3 phrase_table; phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE); while ( i < argc ){ const char * filename = argv[i]; FILE * document = fopen(filename, "r"); if ( NULL == document ){ int err_saved = errno; fprintf(stderr, "can't open file: %s.\n", filename); fprintf(stderr, "error:%s.\n", strerror(err_saved)); exit(err_saved); } HashofDocument hash_of_document = g_hash_table_new (g_direct_hash, g_direct_equal); HashofUnigram hash_of_unigram = g_hash_table_new (g_direct_hash, g_direct_equal); assert(read_document(&phrase_table, &phrase_index, document, hash_of_document, hash_of_unigram)); fclose(document); document = NULL; GHashTableIter iter; gpointer key, value; /* train the document, and convert it to k mixture model. */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { phrase_token_t token1 = GPOINTER_TO_UINT(key); train_second_word(hash_of_unigram, &bigram, hash_of_document, token1); } KMixtureModelMagicHeader magic_header; assert(bigram.get_magic_header(magic_header)); magic_header.m_N ++; assert(bigram.set_magic_header(magic_header)); post_processing_unigram(&bigram, hash_of_unigram); /* free resources of g_hash_of_document */ g_hash_table_iter_init(&iter, hash_of_document); while (g_hash_table_iter_next(&iter, &key, &value)) { HashofSecondWord second_word = (HashofSecondWord) value; g_hash_table_iter_steal(&iter); g_hash_table_unref(second_word); } g_hash_table_unref(hash_of_document); hash_of_document = NULL; g_hash_table_unref(hash_of_unigram); hash_of_unigram = NULL; ++i; } return 0; }