int main(int argc, char * argv[]){ FILE * input = stdin; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- generate n-gram"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } SystemTableInfo system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load(SYSTEM_PHRASE_INDEX); phrase_table.load(chunk); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, input) ){ if ( feof(input) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); if (!save_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); return 0; }
bool read_document(PhraseLargeTable2 * phrase_table, FacadePhraseIndex * phrase_index, FILE * document, HashofDocument hash_of_document, HashofUnigram hash_of_unigram){ char * linebuf = NULL;size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while ( getline(&linebuf, &size, document) ){ if ( feof(document) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; gpointer value = NULL; gboolean lookup_result = g_hash_table_lookup_extended (hash_of_unigram, GUINT_TO_POINTER(cur_token), NULL, &value); if ( !lookup_result ){ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), GUINT_TO_POINTER(1)); } else { guint32 freq = GPOINTER_TO_UINT(value); freq ++; g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), GUINT_TO_POINTER(freq)); } /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !g_train_pi_gram ) continue; last_token = sentence_start; } /* remember the (last_token, cur_token) word pair. */ HashofSecondWord hash_of_second_word = NULL; lookup_result = g_hash_table_lookup_extended (hash_of_document, GUINT_TO_POINTER(last_token), NULL, &value); if ( !lookup_result ){ hash_of_second_word = g_hash_table_new (g_direct_hash, g_direct_equal); } else { hash_of_second_word = (HashofSecondWord) value; } value = NULL; lookup_result = g_hash_table_lookup_extended (hash_of_second_word, GUINT_TO_POINTER(cur_token), NULL, &value); guint32 count = 0; if ( lookup_result ) { count = GPOINTER_TO_UINT(value); } count ++; g_hash_table_insert(hash_of_second_word, GUINT_TO_POINTER(cur_token), GUINT_TO_POINTER(count)); g_hash_table_insert(hash_of_document, GUINT_TO_POINTER(last_token), hash_of_second_word); } free(linebuf); return true; }
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "deleted_bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; } else if ( strcmp("--deleted-bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; } else { print_help(); exit(EINVAL); } ++i; } /* load phrase table. */ PhraseLargeTable2 phrase_table; MemoryChunk * new_chunk = new MemoryChunk; new_chunk->load("phrase_index.bin"); phrase_table.load(new_chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENODATA); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; //increase freq if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); //increase total freq single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } free(linebuf); return 0; }