int main( int argc, char * argv[]){ SystemTableInfo system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; MemoryChunk * chunk = new MemoryChunk; chunk->load("../../data/pinyin_index.bin"); largetable.load(options, chunk, NULL); const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); FacadePhraseIndex phrase_index; if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); Bigram system_bigram; system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram user_bigram; user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); gfloat lambda = system_table_info.get_lambda(); PinyinLookup2 pinyin_lookup(lambda, options, &largetable, &phrase_index, &system_bigram, &user_bigram); /* prepare the prefixes for get_best_match. */ TokenVector prefixes = g_array_new (FALSE, FALSE, sizeof(phrase_token_t)); g_array_append_val(prefixes, sentence_start); CandidateConstraints constraints = g_array_new (TRUE, FALSE, sizeof(lookup_constraint_t)); MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); char* linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); if ( 0 == keys->len ) /* invalid pinyin */ continue; /* initialize constraints. 
*/ g_array_set_size(constraints, keys->len); for ( size_t i = 0; i < constraints->len; ++i){ lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i); constraint->m_type = NO_CONSTRAINT; } guint32 start_time = record_time(); for ( size_t i = 0; i < bench_times; ++i) pinyin_lookup.get_best_match(prefixes, keys, constraints, results); print_time(start_time, bench_times); for ( size_t i = 0; i < results->len; ++i){ phrase_token_t * token = &g_array_index(results, phrase_token_t, i); if ( null_token == *token) continue; printf("pos:%ld,token:%d\t", i, *token); } printf("\n"); char * sentence = NULL; pinyin_lookup.convert_to_utf8(results, sentence); printf("%s\n", sentence); g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); g_free(sentence); } g_array_free(prefixes, TRUE); g_array_free(constraints, TRUE); g_array_free(results, TRUE); free(linebuf); return 0; }
void feed_line(const char * phrase, const char * pinyin, const guint32 freq) { phrase_item * item = new phrase_item; item->length = g_utf8_strlen(phrase, -1); /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp * where is the code which I don't want to touch. :-) */ if (item->length >= MAX_PHRASE_LENGTH) { fprintf(stderr, "Too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); delete item; return; } item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new (FALSE, FALSE, sizeof(ChewingKeyRest)); pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE; parser.parse(options, keys, key_rests, pinyin, strlen(pinyin)); assert(keys->len == key_rests->len); if (keys->len != item->length) { fprintf(stderr, "Invalid pinyin:%s\t%s\t%d\n", phrase, pinyin, freq); delete item; return; } GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item); chewing_and_freq_item value_item; value_item.keys = keys; value_item.key_rests = key_rests; value_item.freq = freq; assert(item->length == value_item.keys->len); if (NULL == array) { array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item)); g_array_append_val(array, value_item); g_tree_insert(g_chewing_tree, item, array); return; } bool found = false; for (size_t i = 0; i < array->len; ++i) { chewing_and_freq_item * cur_item = &g_array_index(array, chewing_and_freq_item, i); int result = pinyin_exact_compare2 ((ChewingKey *) value_item.keys->data, (ChewingKey *) cur_item->keys->data, value_item.keys->len); if (0 == result) { fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", phrase, pinyin, freq); cur_item->freq += freq; found = true; } } if (!found) { g_array_append_val(array, value_item); g_tree_insert(g_chewing_tree, item, array); } else { /* clean up */ g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); } delete item; }
/**
 * Load phrase items for one sub phrase index from a text table.
 *
 * @param phrase_index  slot in m_sub_phrase_indices to fill; a
 *                      SubPhraseIndex is created there if none exists.
 * @param infile        open stream of whitespace separated records:
 *                      pinyin, phrase, token, freq.
 * @return always true.
 *
 * Consecutive records with the same token are collected into one
 * PhraseItem; the item is added when the token changes and once more
 * after the last record. Reading stops at EOF or at the first record
 * that cannot be parsed.
 */
bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
    if ( !sub_phrases ){
        sub_phrases = new SubPhraseIndex;
    }

    char pinyin[256];
    char phrase[256];
    phrase_token_t token;
    size_t freq;

    PhraseItem * item_ptr = new PhraseItem;
    /* 0 doubles as the "no token seen yet" marker. */
    phrase_token_t cur_token = 0;

    /* Bound both %s conversions to the buffer size and require all four
     * fields, so that an over-long word cannot overflow the buffers and a
     * short or malformed record is never processed with stale values.
     * Testing the conversion count instead of feof() also keeps the last
     * record of a file that does not end in whitespace. */
    while ( 4 == fscanf(infile, "%255s %255s %u %zu",
                        pinyin, phrase, &token, &freq) ){
        assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );

        glong written = 0;
        utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
                                                 &written, NULL);
        if ( NULL == phrase_utf16 ){
            /* conversion failed (e.g. invalid UTF-8): skip the record. */
            fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
                    pinyin, phrase);
            continue;
        }

        if ( 0 == cur_token ){
            cur_token = token;
            item_ptr->set_phrase_string(written, phrase_utf16);
        }

        if ( cur_token != token ){
            /* token changed: flush the finished item, start a new one. */
            add_phrase_item( cur_token, item_ptr);
            delete item_ptr;
            item_ptr = new PhraseItem;
            cur_token = token;
            item_ptr->set_phrase_string(written, phrase_utf16);
        }

        pinyin_option_t options = USE_TONE;
        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));

        if (item_ptr->get_phrase_length() == keys->len) {
            item_ptr->append_pronunciation((ChewingKey *)keys->data, freq);
        } else {
            fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
                    pinyin, phrase);
        }

        g_array_free(keys, TRUE);
        g_array_free(key_rests, TRUE);
        g_free(phrase_utf16);
    }

    if ( !feof(infile) ){
        fprintf(stderr,
                "FacadePhraseIndex::load_text: stopped at malformed record.\n");
    }

    /* flush the last collected item. */
    add_phrase_item( cur_token, item_ptr);
    delete item_ptr;

    m_total_freq += m_sub_phrase_indices[phrase_index]->
        get_phrase_index_total_freq();
    return true;
}
int main(int argc, char * argv[]) { pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE; ChewingLargeTable largetable(options); FacadePhraseIndex phrase_index; FILE * gbfile = fopen("../../data/gb_char.table", "r"); if (NULL == gbfile) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbfile); fseek(gbfile, 0L, SEEK_SET); phrase_index.load_text(1, gbfile); fclose(gbfile); FILE * gbkfile = fopen("../../data/gbk_char.table", "r"); if (NULL == gbkfile) { fprintf(stderr, "open gbk_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbkfile); fseek(gbkfile, 0L, SEEK_SET); phrase_index.load_text(2, gbkfile); fclose(gbkfile); MemoryChunk * new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; while( getline(&linebuf, &size, stdin) ){ linebuf[strlen(linebuf)-1] = '\0'; if ( strcmp ( linebuf, "quit" ) == 0) break; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); if (0 == keys->len) { fprintf(stderr, "Invalid input.\n"); continue; } guint32 start = record_time(); PhraseIndexRanges ranges; memset(ranges, 0, sizeof(PhraseIndexRanges)); guint8 min_index, max_index; phrase_index.get_sub_phrase_range(min_index, max_index); for (size_t i = min_index; i < max_index; ++i) { ranges[i] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); } for (size_t i = 0; i < bench_times; ++i) { largetable.search(keys->len, (ChewingKey *)keys->data, ranges); } for (size_t i = min_index; i < max_index; ++i) { g_array_set_size(ranges[i], 0); } print_time(start, bench_times); largetable.search(keys->len, (ChewingKey *)keys->data, ranges); for (size_t i = min_index; i < max_index; ++i) { GArray * & range = ranges[i]; if (range) { if (range->len) printf("range items number:%d\n", 
range->len); for (size_t k = 0; k < range->len; ++k) { PhraseIndexRange * onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); PhraseItem item; for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){ phrase_index.get_phrase_item( token, item); /* get phrase string */ gunichar2 buffer[MAX_PHRASE_LENGTH + 1]; item.get_phrase_string(buffer); char * string = g_utf16_to_utf8 ( buffer, item.get_phrase_length(), NULL, NULL, NULL); printf("%s\t", string); g_free(string); ChewingKey chewing_buffer[MAX_PHRASE_LENGTH]; size_t npron = item.get_n_pronunciation(); guint32 freq; for (size_t m = 0; m < npron; ++m){ item.get_nth_pronunciation(m, chewing_buffer, freq); for (size_t n = 0; n < item.get_phrase_length(); ++n){ printf("%s'", chewing_buffer[n].get_pinyin_string()); } printf("\b\t%d\t", freq); } } printf("\n"); } } g_array_set_size(range, 0); } g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); } if (linebuf) free(linebuf); return 0; }
int main(int argc, char * argv[]) { SystemTableInfo system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE; ChewingLargeTable largetable(options); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index)) exit(ENOENT); MemoryChunk * new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; ssize_t read; while ((read = getline(&linebuf, &size, stdin)) != -1) { if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); if (0 == keys->len) { fprintf(stderr, "Invalid input.\n"); continue; } guint32 start = record_time(); PhraseIndexRanges ranges; memset(ranges, 0, sizeof(PhraseIndexRanges)); phrase_index.prepare_ranges(ranges); for (size_t i = 0; i < bench_times; ++i) { phrase_index.clear_ranges(ranges); largetable.search(keys->len, (ChewingKey *)keys->data, ranges); } print_time(start, bench_times); phrase_index.clear_ranges(ranges); largetable.search(keys->len, (ChewingKey *)keys->data, ranges); for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { GArray * & range = ranges[i]; if (!range) continue; if (range->len) printf("range items number:%d\n", range->len); for (size_t k = 0; k < range->len; ++k) { PhraseIndexRange * onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); PhraseItem item; for ( phrase_token_t token = 
onerange->m_range_begin; token != onerange->m_range_end; ++token){ phrase_index.get_phrase_item( token, item); /* get phrase string */ ucs4_t buffer[MAX_PHRASE_LENGTH + 1]; item.get_phrase_string(buffer); char * string = g_ucs4_to_utf8 ( buffer, item.get_phrase_length(), NULL, NULL, NULL); printf("%s\t", string); g_free(string); ChewingKey chewing_buffer[MAX_PHRASE_LENGTH]; size_t npron = item.get_n_pronunciation(); guint32 freq; for (size_t m = 0; m < npron; ++m){ item.get_nth_pronunciation(m, chewing_buffer, freq); for (size_t n = 0; n < item.get_phrase_length(); ++n){ gchar * pinyins = chewing_buffer[n].get_pinyin_string(); printf("%s'", pinyins); g_free(pinyins); } printf("\b\t%d\t", freq); } } printf("\n"); } g_array_set_size(range, 0); } phrase_index.destroy_ranges(ranges); g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); } if (linebuf) free(linebuf); /* mask out all index items. */ largetable.mask_out(0x0, 0x0); return 0; }