int main(int argc, char * argv[]){ GError * error = NULL; GOptionContext * context; context = g_option_context_new("- validate k mixture model"); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } if (2 != argc) { fprintf(stderr, "wrong arguments.\n"); exit(EINVAL); } const char * k_mixture_model_filename = argv[1]; KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READONLY); if (!validate_unigram(&bigram)) { fprintf(stderr, "k mixture model validation failed.\n"); exit(ENODATA); } if (!validate_bigram(&bigram)) { fprintf(stderr, "k mixture model validation failed.\n"); exit(ENODATA); } return 0; }
/* Import a textual k mixture model dump (read from stdin) into the
 * binary model database.
 *
 * NOTE(review): "entries", "values", "required" and
 * "k_mixture_model_filename" are presumably file-scope globals shared
 * with the tag-parsing helpers -- confirm against the rest of the file. */
int main(int argc, char * argv[]){
    FILE * input = stdin;

    setlocale(LC_ALL, "");
    GError * error = NULL;
    GOptionContext * context;
    context = g_option_context_new("- import k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    /* load the system table configuration. */
    SystemTableInfo2 system_table_info;
    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;
    const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables();
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* open (or create) the target binary k mixture model. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    taglib_init();

    /* prepare to read n-gram model */
    values = g_ptr_array_new();
    required = g_hash_table_new(g_str_hash, g_str_equal);

    /* the first line of input must be the model head line. */
    ssize_t result = my_getline(input);
    if ( result == -1 ) {
        fprintf(stderr, "empty file input.\n");
        exit(ENODATA);
    }

    if (!parse_headline(&bigram))
        exit(ENODATA);

    /* parse the remaining body lines, if any. */
    result = my_getline(input);
    if ( result != -1 )
        parse_body(input, &phrase_table, &phrase_index, &bigram);

    taglib_fini();
    return 0;
}
/* Validate a k mixture model file (manual argument-parsing variant):
 * check the unigram and bigram data for internal consistency.
 *
 * The last non-option argument is taken as the model filename.
 * Exits with EINVAL on bad arguments and ENODATA on validation failure. */
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else {
            k_mixture_model_filename = argv[i];
        }
        ++i;
    }

    /* fix: without this guard a missing filename argument left
     * k_mixture_model_filename NULL and passed it to attach(). */
    if ( NULL == k_mixture_model_filename ){
        print_help();
        exit(EINVAL);
    }

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);

    if (!validate_unigram(&bigram)) {
        fprintf(stderr, "k mixture model validation failed.\n");
        exit(ENODATA);
    }

    if (!validate_bigram(&bigram)) {
        fprintf(stderr, "k mixture model validation failed.\n");
        exit(ENODATA);
    }

    return 0;
}
int main(int argc, char * argv[]){ setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- estimate k mixture model"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } /* magic header signature check here. */ KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(bigram_filename, ATTACH_READONLY); KMixtureModelBigram deleted_bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); deleted_bigram.attach(deleted_bigram_filename, ATTACH_READONLY); GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); deleted_bigram.get_all_items(deleted_items); parameter_t lambda_sum = 0; int lambda_count = 0; for( size_t i = 0; i < deleted_items->len; ++i ){ phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i); KMixtureModelSingleGram * single_gram = NULL; bigram.load(*token, single_gram, true); KMixtureModelSingleGram * deleted_single_gram = NULL; deleted_bigram.load(*token, deleted_single_gram); KMixtureModelArrayHeader array_header; if (single_gram) assert(single_gram->get_array_header(array_header)); KMixtureModelArrayHeader deleted_array_header; assert(deleted_single_gram->get_array_header(deleted_array_header)); if ( 0 != deleted_array_header.m_WC ) { parameter_t lambda = compute_interpolation(deleted_single_gram, &bigram, single_gram); printf("token:%d lambda:%f\n", *token, lambda); lambda_sum += lambda; lambda_count ++; } if (single_gram) delete single_gram; delete deleted_single_gram; } printf("average lambda:%f\n", (lambda_sum/lambda_count)); g_array_free(deleted_items, TRUE); return 0; }
void JelinekMercerFeature::read_counts(const std::string filename) { std::ifstream infile; infile.open(filename.c_str()); int type; int count; int contexts; int next; std::string word; infile >> _corpora; infile >> _vocab; _types.resize(_vocab); _corpus_names.resize(_corpora); for (int ii=0; ii < _vocab; ++ii) { infile >> word; _types[ii] = word; if (ii % 25000 == 0) std::cout << "Read vocab " << word << " (" << ii << "/" << _vocab << ")" << std::endl; } /* * Size the counts appropriately */ assert(_corpora > 0); this->_unigram.resize(_corpora); this->_bigram.resize(_corpora); this->_normalizer.resize(_corpora); this->_compare.resize(_corpora); for (int cc=0; cc < _corpora; ++cc) { infile >> _corpus_names[cc]; infile >> _compare[cc]; if (cc % 1000 == 0) std::cout << "Read corpus for " << _corpus_names[cc] << " (" << cc << "/" << _corpora << ")" << std::endl; _unigram[cc].resize(_vocab); _normalizer[cc] = 0; for (int vv=0; vv < _vocab; ++vv) { infile >> type; infile >> count; infile >> contexts; assert(type == vv); // Set unigram counts _normalizer[cc] += (float)count + kSMOOTH; _unigram[cc][type] = count; for (int bb=0; bb<contexts; ++bb) { infile >> next; infile >> count; _bigram[cc][bigram(type, next)] = count; } } } std::cout << "Done reading corpus" << std::endl; }
double CBigramHistory::pr(uint32_t* its_wid, uint32_t* ite_wid, uint32_t wid) { TBigram bigram(DCWID, DCWID); if (its_wid != ite_wid) bigram.first = *(ite_wid - 1); bigram.second = wid; return pr(bigram); }
void CBigramHistory::forget(uint32_t *its_wid, uint32_t *ite_wid) { for (; its_wid < ite_wid; ++its_wid) { TBigram bigram(*its_wid, DCWID); if (its_wid + 1 != ite_wid) bigram.second = *(its_wid + 1); TBigramPool::iterator it = m_bifreq.find(bigram); if (it != m_bifreq.end()) m_bifreq.erase(it); } }
int main(int argc, char * argv[]){ int i = 1; const char * k_mixture_model_filename = NULL; FILE * output = stdout; while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp ("--k-mixture-model-file", argv[i]) == 0 ){ if ( ++i > argc ){ print_help(); exit(EINVAL); } k_mixture_model_filename = argv[i]; } else { print_help(); exit(EINVAL); } ++i; } FacadePhraseIndex phrase_index; //gb_char binary file MemoryChunk * chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); //gbk_char binary file chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READONLY); print_k_mixture_model_magic_header(output, &bigram); print_k_mixture_model_array_headers(output, &bigram, &phrase_index); print_k_mixture_model_array_items(output, &bigram, &phrase_index); end_data(output); return 0; }
/* Rebuild the history from a serialized buffer of uint32_t word ids.
 *
 * @param buf_ptr  raw buffer of word ids (sz bytes, mutated in place!)
 * @param sz       buffer size in bytes; converted to a word count below
 * @return         always true
 *
 * NOTE(review): on little-endian hosts the buffer is byte-swapped in
 * place, which implies the serialized form is big-endian -- confirm
 * against the matching save/serialize routine. */
bool CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz)
{
    clear();

    /* convert the byte size into a word-id count. */
    sz /= sizeof(uint32_t);
    uint32_t *pw = (uint32_t*)buf_ptr;
    if (pw && sz > 0) {
#ifndef WORDS_BIGENDIAN
        std::transform(pw, pw + sz, pw, swap32);
#endif
        /* Slide a bigram window over the stream, seeded with DC
         * (don't-care) ids, replaying each word into the memory deque
         * and both frequency tables. */
        TBigram bigram(DCWID, DCWID);
        for (size_t i = 0; i < sz; ++i) {
            bigram.first = bigram.second;
            bigram.second = *pw++;
            m_memory.push_back(bigram.second);
            incUniFreq(bigram.second);
            incBiFreq(bigram);
        }
    }
    return true;
}
/* Append the word-id stream [its_wid, ite_wid) to the context memory,
 * keeping the unigram/bigram frequency tables in sync and evicting the
 * oldest entry whenever the memory is at contxt_memory_size capacity.
 * Always returns true. */
bool CBigramHistory::memorize(uint32_t* its_wid, uint32_t* ite_wid)
{
    TBigram bigram(DCWID, DCWID);

    // First, we insert a DC word id before the context history
    // to separated from previous stream.
    if (m_memory.size() == contxt_memory_size) {
        /* Evict the oldest word: remove its unigram count and the
         * bigram it forms with its (new front) successor. */
        TBigram hb;
        hb.first = m_memory.front();
        m_memory.pop_front();
        hb.second = m_memory.front();

        decUniFreq(hb.first);
        decBiFreq(hb);
    }
    m_memory.push_back(DCWID);

    //Now trying to memorize new stream and forget oldest
    for (; its_wid != ite_wid; ++its_wid) {
        if (m_memory.size() == contxt_memory_size) {
            /* Same eviction as above, once per inserted word. */
            TBigram hb;
            hb.first = m_memory.front();
            m_memory.pop_front();
            hb.second = m_memory.front();

            decUniFreq(hb.first);
            decBiFreq(hb);
        }
        /* Slide the bigram window and record the new word. */
        bigram.first = bigram.second;
        bigram.second = *its_wid;
        m_memory.push_back(*its_wid);
        incUniFreq(bigram.second);
        incBiFreq(bigram);
    }
    return true;
}
/* Train a k mixture model from plain-text document files (manual
 * argument-parsing variant).
 *
 * Options (each value option uses the following argv slot):
 *   --skip-pi-gram-training, --maximum-occurs-allowed N,
 *   --maximum-increase-rates-allowed F, --k-mixture-model-file PATH.
 * Remaining arguments are treated as document files to train on.
 *
 * NOTE(review): read_document(), get_magic_header() and
 * set_magic_header() are invoked inside assert(); under NDEBUG those
 * calls disappear entirely -- confirm this tool is always built with
 * assertions enabled. */
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            g_train_pi_gram = false;
        } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_occurs = atoi(argv[i]);
        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_increase_rates = atof(argv[i]);
        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            /* first unrecognized argument starts the document list. */
            break;
        }
        ++i;
    }

    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    /* open (or create) the target binary k mixture model. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    /* train on each remaining document file. */
    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        /* hash_of_document: first word -> (second word -> counts);
         * hash_of_unigram: per-word occurrence counts. */
        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        /* one more document seen: bump the document counter N. */
        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }

        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;
        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
/* Prune a k mixture model in place: options -k (prune count) and
 * --CDF (probability threshold) feed the globals used by
 * prune_k_mixture_model(); the last positional argument is the model
 * file.
 *
 * NOTE(review): several calls with side effects (get_array_header,
 * set_array_header, remove) live inside assert() and would vanish
 * under NDEBUG -- confirm assertions stay enabled for this tool. */
int main(int argc, char * argv[]){
    int i = 1;
    const char * bigram_filename = NULL;

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("-k", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_prune_k = atoi(argv[i]);
        } else if ( strcmp("--CDF", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_prune_poss = atof(argv[i]);
        } else {
            bigram_filename = argv[i];
        }
        ++i;
    }

    /* TODO: magic header signature check here. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(bigram_filename, ATTACH_READWRITE);

    KMixtureModelMagicHeader magic_header;
    bigram.get_magic_header(magic_header);

    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram.get_all_items(items);

    /* print prune progress */
    size_t progress = 0;
    size_t onestep = items->len / 20;
    /* NOTE(review): this loop's "i" shadows the outer argv index "i". */
    for ( size_t i = 0; i < items->len; ++i ){
        if ( progress >= onestep ) {
            progress = 0;
            fprintf(stderr, "*");
        }
        progress ++;

        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        KMixtureModelSingleGram * single_gram = NULL;
        bigram.load(*token, single_gram);

        /* prune this single gram; removed items are collected so the
         * corresponding unigram headers can be reduced below. */
        FlexibleBigramPhraseArray removed_array = g_array_new
            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
        prune_k_mixture_model(&magic_header, single_gram, removed_array);
        bigram.store(*token, single_gram);
        delete single_gram;

        /* post processing for unigram reduce */
        for (size_t m = 0; m < removed_array->len; ++m ){
            KMixtureModelArrayItemWithToken * item = &g_array_index
                (removed_array, KMixtureModelArrayItemWithToken, m);
            KMixtureModelArrayHeader array_header;
            assert(bigram.get_array_header(item->m_token, array_header));
            array_header.m_freq -= item->m_item.m_WC;
            assert(array_header.m_freq >= 0);
            assert(bigram.set_array_header(item->m_token, array_header));
        }
        g_array_free(removed_array, TRUE);
        removed_array = NULL;
    }
    fprintf(stderr, "\n");

    /* write back the updated magic header (totals changed by pruning). */
    bigram.set_magic_header(magic_header);

    /* post processing clean up zero items */
    KMixtureModelArrayHeader array_header;
    for ( size_t i = 0; i < items->len; ++i ){
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        assert(bigram.get_array_header(*token, array_header));
        /* drop tokens whose counts were pruned down to nothing. */
        if ( 0 == array_header.m_WC && 0 == array_header.m_freq )
            assert(bigram.remove(*token));
    }

    g_array_free(items, TRUE);
    return 0;
}
/* Count of the (first, second) bigram in the given corpus; pairs that
 * were never observed count as zero. */
int JelinekMercerFeature::bigram_count(int corpus, int first, int second) {
  bigram key(first, second);
  if (_bigram[corpus].find(key) != _bigram[corpus].end())
    return _bigram[corpus][key];
  return 0;
}
/* Generate (train) a k mixture model from plain-text document files
 * (GOption variant). Options are parsed into globals via "entries";
 * "g_k_mixture_model_filename" is presumably set there -- confirm.
 * Remaining arguments are document files to train on.
 *
 * NOTE(review): read_document(), get_magic_header() and
 * set_magic_header() are invoked inside assert(); under NDEBUG those
 * calls disappear -- confirm assertions stay enabled for this tool. */
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");
    GError * error = NULL;
    GOptionContext * context;
    context = g_option_context_new("- generate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    /* load the system table configuration. */
    SystemTableInfo2 system_table_info;
    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 phrase_table;
    phrase_table.attach(SYSTEM_PHRASE_INDEX, ATTACH_READONLY);

    FacadePhraseIndex phrase_index;
    const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables();
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* open (or create) the target binary k mixture model. */
    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    /* train on each remaining document file. */
    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        /* hash_of_document: first word -> (second word -> counts);
         * hash_of_unigram: per-word occurrence counts. */
        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        /* one more document seen: bump the document counter N. */
        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }

        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;
        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
/* Exercise FlexibleSingleGram and FlexibleBigram: array item get/set,
 * header get/set, store/load round trips, enumeration, and removal.
 * Results are printed for manual inspection; invariants are asserted. */
int main(int argc, char * argv[]) {
    FlexibleSingleGram<guint32, guint32> single_gram;
    typedef FlexibleSingleGram<guint32, guint32>::ArrayItemWithToken array_item_t;

    const guint32 total_freq = 16;
    assert(single_gram.set_array_header(total_freq));

    /* token 3 appears twice, so its second freq (32) overwrites 8. */
    phrase_token_t tokens[6] = { 2, 6, 4, 3, 1, 3 };
    guint32 freqs[6] = { 1, 2, 4, 8, 16, 32};

    guint32 freq;

    for ( size_t i = 0; i < 6; ++i ){
        if ( single_gram.get_array_item(tokens[i], freq) )
            assert(single_gram.set_array_item(tokens[i], freqs[i]));
        else
            assert(single_gram.insert_array_item(tokens[i], freqs[i]));
    }

    single_gram.get_array_item(3, freq);
    assert(freq == 32);

    printf("--------------------------------------------------------\n");
    PhraseIndexRange range;
    FlexibleBigramPhraseArray array = g_array_new(FALSE, FALSE, sizeof(array_item_t));
    range.m_range_begin = 0; range.m_range_end = 8;
    single_gram.search(&range, array);
    for ( size_t i = 0; i < array->len; ++i ){
        array_item_t * item = &g_array_index(array, array_item_t, i);
        printf("item:%d:%d\n", item->m_token, item->m_item);
    }

    assert(single_gram.get_array_header(freq));
    assert(freq == total_freq);

    /* persist two single grams under tokens 1 and 2. */
    FlexibleBigram<guint32, guint32, guint32> bigram("TEST");
    assert(bigram.attach("/tmp/training.db", ATTACH_READWRITE|ATTACH_CREATE));
    bigram.store(1, &single_gram);

    /* mutate the gram before storing the second copy. */
    assert(single_gram.insert_array_item(5, 8));
    assert(single_gram.remove_array_item(1, freq));
    assert(single_gram.set_array_header(32));
    assert(single_gram.get_array_header(freq));
    printf("new array header:%d\n", freq);
    bigram.store(2, &single_gram);

    /* load both grams back and dump their items. */
    for (int m = 1; m <= 2; ++m ){
        printf("--------------------------------------------------------\n");
        FlexibleSingleGram<guint32, guint32> * train_gram;
        bigram.load(m, train_gram);
        g_array_set_size(array, 0);
        range.m_range_begin = 0; range.m_range_end = 8;
        train_gram->search(&range, array);
        for ( size_t i = 0; i < array->len; ++i ){
            array_item_t * item = &g_array_index(array, array_item_t, i);
            printf("item:%d:%d\n", item->m_token, item->m_item);
        }
        delete train_gram;
    }

    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram.get_all_items(items);
    printf("-----------------------items----------------------------\n");
    for ( size_t i = 0; i < items->len; ++i ){
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        printf("item:%d\n", *token);
    }

    /* magic header round trip. */
    printf("-----------------------magic header---------------------\n");
    bigram.set_magic_header(total_freq);
    bigram.get_magic_header(freq);
    assert(total_freq == freq);
    printf("magic header:%d\n", freq);

    printf("-----------------------array header---------------------\n");
    for ( size_t i = 1; i <= 2; ++i){
        bigram.get_array_header(i, freq);
        printf("single gram: %d, freq:%d\n", i, freq);
    }
    /* overwrite one header and dump again to show the change. */
    bigram.set_array_header(1, 1);
    printf("-----------------------array header---------------------\n");
    for ( size_t i = 1; i <= 2; ++i){
        bigram.get_array_header(i, freq);
        printf("single gram: %d, freq:%d\n", i, freq);
    }

    for (int m = 1; m <= 2; ++m ){
        printf("--------------------------------------------------------\n");
        FlexibleSingleGram<guint32, guint32> * train_gram;
        bigram.load(m, train_gram);
        g_array_set_size(array, 0);
        range.m_range_begin = 0; range.m_range_end = 8;
        train_gram->search(&range, array);
        for ( size_t i = 0; i < array->len; ++i ){
            array_item_t * item = &g_array_index(array, array_item_t, i);
            printf("item:%d:%d\n", item->m_token, item->m_item);
        }
        delete train_gram;
    }

    /* remove token 1 and list what's left. */
    assert(bigram.remove(1));
    bigram.get_all_items(items);
    printf("-----------------------items----------------------------\n");
    for ( size_t i = 0; i < items->len; ++i ){
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        printf("item:%d\n", *token);
    }

    g_array_free(items, TRUE);
    g_array_free(array, TRUE);
    return 0;
}