/**
 * Join the phrases referenced by match_results into one UTF-8 string.
 *
 * @param phrase_index   index used to look up each token's phrase item.
 * @param match_results  array of phrase_token_t; null_token entries are skipped.
 * @param delimiter      text inserted between consecutive phrases; NULL is
 *                       treated as the empty string.
 * @param result_string  out: reset to NULL on entry, then set to a string
 *                       built with g_strdup()/g_strconcat(). It stays NULL
 *                       when no phrase was appended.
 * @return always true.
 */
bool convert_to_utf8(FacadePhraseIndex * phrase_index,
                     MatchResults match_results,
                     /* in */ const char * delimiter,
                     /* out */ char * & result_string){
    const char * separator = (NULL != delimiter) ? delimiter : "";
    result_string = NULL;

    PhraseItem item;
    ucs4_t ucs4_phrase[MAX_PHRASE_LENGTH];

    for (size_t pos = 0; pos < match_results->len; ++pos) {
        phrase_token_t token = g_array_index(match_results, phrase_token_t, pos);
        if (null_token == token)
            continue;

        /* fetch the phrase text and convert it to UTF-8 */
        phrase_index->get_phrase_item(token, item);
        item.get_phrase_string(ucs4_phrase);
        guint8 phrase_len = item.get_phrase_length();
        gchar * utf8_phrase = g_ucs4_to_utf8(ucs4_phrase, phrase_len,
                                             NULL, NULL, NULL);

        /* the first phrase starts the result; later ones are appended
           after the separator */
        gchar * joined = NULL;
        if (result_string)
            joined = g_strconcat(result_string, separator, utf8_phrase, NULL);
        else
            joined = g_strdup(utf8_phrase);

        g_free(result_string);
        g_free(utf8_phrase);
        result_string = joined;
    }

    return true;
}
bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) { fprintf(output, "\\1-gram\n"); for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) { PhraseIndexRange range; int result = phrase_index->get_range(i, range); if (ERROR_OK != result ) continue; PhraseItem item; for (phrase_token_t token = range.m_range_begin; token < range.m_range_end; token++) { int result = phrase_index->get_phrase_item(token, item); if ( result == ERROR_NO_ITEM ) continue; assert( result == ERROR_OK); size_t freq = item.get_unigram_frequency(); if ( 0 == freq ) continue; char * phrase = taglib_token_to_string(phrase_index, token); if ( phrase ) fprintf(output, "\\item %d %s count %ld\n", token, phrase, freq); g_free(phrase); } } return true; }
/**
 * Iteratively estimate the interpolation weight lambda between the bigram
 * and unigram estimates, using the counts stored in deleted_bigram.
 *
 * Starting from 0.6, each round computes
 *   next_lambda = sum(count * num / (num + den)) / total_deleted_count
 * with num = lambda * P_bigram(token) and den = (1 - lambda) * P_unigram(token),
 * and stops once two successive values differ by no more than 0.001.
 *
 * @param deleted_bigram  held-out counts; supplies the items and the divisor.
 * @param unigram         phrase index providing unigram frequencies.
 * @param bigram          bigram counts; may be NULL, in which case the bigram
 *                        probability is taken as 0 for every token.
 * @return the converged lambda.
 */
parameter_t compute_interpolation(SingleGram * deleted_bigram,
                                  FacadePhraseIndex * unigram,
                                  SingleGram * bigram){
    parameter_t lambda = 0, next_lambda = 0.6;
    parameter_t epsilon = 0.001;

    while (fabs(lambda - next_lambda) > epsilon) {
        lambda = next_lambda;
        next_lambda = 0;
        guint32 table_num = 0;
        parameter_t numerator = 0;
        parameter_t part_of_denominator = 0;

        BigramPhraseWithCountArray array =
            g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
        deleted_bigram->retrieve_all(array);

        for (guint i = 0; i < array->len; ++i) {
            BigramPhraseItemWithCount * item =
                &g_array_index(array, BigramPhraseItemWithCount, i);
            /* get the phrase token */
            phrase_token_t token = item->m_token;
            guint32 deleted_count = item->m_count;

            {
                /* bigram share of the estimate */
                guint32 freq = 0;
                parameter_t elem_poss = 0;
                if (bigram && bigram->get_freq(token, freq)) {
                    guint32 total_freq = 0;
                    /* The call must stay outside assert(): with NDEBUG the
                       assert expression is not evaluated at all. */
                    bool has_total = bigram->get_total_freq(total_freq);
                    assert(has_total);
                    (void) has_total;
                    assert(0 != total_freq);
                    elem_poss = freq / (parameter_t) total_freq;
                }
                numerator = lambda * elem_poss;
            }

            {
                /* unigram share of the estimate */
                parameter_t elem_poss = 0;
                PhraseItem phrase_item;
                if (!unigram->get_phrase_item(token, phrase_item)) {
                    guint32 freq = phrase_item.get_unigram_frequency();
                    guint32 total_freq = unigram->get_phrase_index_total_freq();
                    elem_poss = freq / (parameter_t) total_freq;
                }
                part_of_denominator = (1 - lambda) * elem_poss;
            }

            /* nothing to contribute, and avoids 0/0 */
            if (0 == (numerator + part_of_denominator))
                continue;

            next_lambda += deleted_count *
                (numerator / (numerator + part_of_denominator));
        }

        /* Same reason as above: keep the side effect out of assert(). */
        bool has_table_num = deleted_bigram->get_total_freq(table_num);
        assert(has_table_num);
        (void) has_table_num;
        next_lambda /= table_num;

        g_array_free(array, TRUE);
    }

    lambda = next_lambda;
    return lambda;
}
/**
 * For each token, append the keys of its highest-frequency pronunciation
 * to keys.
 *
 * keys is emptied first. For every token the pronunciation with the
 * largest reported frequency is chosen (the first one wins on ties, and
 * index 0 is used when all frequencies are 0), and get_phrase_length()
 * keys from it are appended.
 *
 * @param phrase_index  index used to look up each token's phrase item.
 * @param tokens        array of phrase_token_t to convert.
 * @param keys          out: receives the concatenated ChewingKey values.
 * @return always true.
 */
bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
                         TokenVector tokens, ChewingKeyVector keys){
    ChewingKey buffer[MAX_PHRASE_LENGTH];
    size_t key_index;
    guint32 max_freq;
    guint32 freq;

    g_array_set_size(keys, 0);

    for (size_t i = 0; i < tokens->len; ++i) {
        phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i);
        PhraseItem item;
        phrase_index->get_phrase_item(*token, item);

        key_index = 0;
        max_freq = 0;
        /* The get_nth_pronunciation() calls stay outside assert(): with
           NDEBUG the assert expression is not evaluated, which would leave
           buffer and freq unfilled. */
        bool fetched = false;
        for (size_t m = 0; m < item.get_n_pronunciation(); ++m) {
            freq = 0;
            fetched = item.get_nth_pronunciation(m, buffer, freq);
            assert(fetched);
            if (freq > max_freq) {
                key_index = m;
                max_freq = freq;
            }
        }

        /* re-read the winning pronunciation so buffer holds its keys */
        fetched = item.get_nth_pronunciation(key_index, buffer, freq);
        assert(fetched);
        (void) fetched;
        assert(max_freq == freq);

        guint8 len = item.get_phrase_length();
        g_array_append_vals(keys, buffer, len);
    }

    return true;
}
int main( int argc, char * argv[]){ PinyinCustomSettings custom; PinyinLargeTable largetable(&custom); FacadePhraseIndex phrase_index; FILE * gbfile = fopen("../../data/gb_char.table", "r"); if ( gbfile == NULL ) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbfile); fseek(gbfile, 0L, SEEK_SET); phrase_index.load_text(1, gbfile); fclose(gbfile); FILE * gbkfile = fopen("../../data/gbk_char.table","r"); if ( gbkfile == NULL ) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbkfile); fseek(gbkfile, 0L, SEEK_SET); phrase_index.load_text(2, gbkfile); fclose(gbkfile); MemoryChunk* new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; while( getline(&linebuf, &size, stdin) ){ linebuf[strlen(linebuf)-1] = '\0'; if ( strcmp ( linebuf, "quit" ) == 0) break; PinyinDefaultParser parser; NullPinyinValidator validator; PinyinKeyVector keys; PinyinKeyPosVector poses; keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); parser.parse(validator, keys, poses, linebuf); guint32 start = record_time(); PhraseIndexRanges ranges; for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange)); } for ( size_t i = 0 ; i < bench_times; ++i){ largetable.search(keys->len, (PinyinKey *)keys->data, ranges); } for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ GArray * range = ranges[i]; g_array_set_size( range, 0); } print_time(start, bench_times); largetable.search(keys->len, (PinyinKey *)keys->data, ranges); for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ GArray * range = ranges[i]; if ( range ){ for (size_t k = 0; k < range->len; ++k){ PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); PhraseItem item; for ( 
phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){ phrase_index.get_phrase_item( token, item); gunichar2 bufstr[1024]; item.get_phrase_string(bufstr); char * string = g_utf16_to_utf8 ( bufstr, item.get_phrase_length(), NULL, NULL, NULL); printf("%s\t", string); g_free(string); PinyinKey pinyin_buffer[1024]; size_t npron = item.get_n_pronunciation(); guint32 freq; for ( size_t n = 0; n < npron; ++n){ item.get_nth_pronunciation(n, pinyin_buffer, freq); for ( size_t o = 0; o < item.get_phrase_length(); ++o){ printf("%s'", pinyin_buffer[o].get_key_string()); } printf("\b\t%d\t", freq); } printf("\n"); } } if ( range->len) printf("range items number:%d\n", range->len); } g_array_set_size( range, 0); } g_array_free(keys, TRUE); g_array_free(poses, TRUE); } if (linebuf) free(linebuf); return 0; }
int main(int argc, char * argv[]) { pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE; ChewingLargeTable largetable(options); FacadePhraseIndex phrase_index; FILE * gbfile = fopen("../../data/gb_char.table", "r"); if (NULL == gbfile) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbfile); fseek(gbfile, 0L, SEEK_SET); phrase_index.load_text(1, gbfile); fclose(gbfile); FILE * gbkfile = fopen("../../data/gbk_char.table", "r"); if (NULL == gbkfile) { fprintf(stderr, "open gbk_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbkfile); fseek(gbkfile, 0L, SEEK_SET); phrase_index.load_text(2, gbkfile); fclose(gbkfile); MemoryChunk * new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; while( getline(&linebuf, &size, stdin) ){ linebuf[strlen(linebuf)-1] = '\0'; if ( strcmp ( linebuf, "quit" ) == 0) break; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); if (0 == keys->len) { fprintf(stderr, "Invalid input.\n"); continue; } guint32 start = record_time(); PhraseIndexRanges ranges; memset(ranges, 0, sizeof(PhraseIndexRanges)); guint8 min_index, max_index; phrase_index.get_sub_phrase_range(min_index, max_index); for (size_t i = min_index; i < max_index; ++i) { ranges[i] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange)); } for (size_t i = 0; i < bench_times; ++i) { largetable.search(keys->len, (ChewingKey *)keys->data, ranges); } for (size_t i = min_index; i < max_index; ++i) { g_array_set_size(ranges[i], 0); } print_time(start, bench_times); largetable.search(keys->len, (ChewingKey *)keys->data, ranges); for (size_t i = min_index; i < max_index; ++i) { GArray * & range = ranges[i]; if (range) { if (range->len) printf("range items number:%d\n", 
range->len); for (size_t k = 0; k < range->len; ++k) { PhraseIndexRange * onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); PhraseItem item; for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){ phrase_index.get_phrase_item( token, item); /* get phrase string */ gunichar2 buffer[MAX_PHRASE_LENGTH + 1]; item.get_phrase_string(buffer); char * string = g_utf16_to_utf8 ( buffer, item.get_phrase_length(), NULL, NULL, NULL); printf("%s\t", string); g_free(string); ChewingKey chewing_buffer[MAX_PHRASE_LENGTH]; size_t npron = item.get_n_pronunciation(); guint32 freq; for (size_t m = 0; m < npron; ++m){ item.get_nth_pronunciation(m, chewing_buffer, freq); for (size_t n = 0; n < item.get_phrase_length(); ++n){ printf("%s'", chewing_buffer[n].get_pinyin_string()); } printf("\b\t%d\t", freq); } } printf("\n"); } } g_array_set_size(range, 0); } g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); } if (linebuf) free(linebuf); return 0; }
int main(int argc, char * argv[]) { SystemTableInfo system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE; ChewingLargeTable largetable(options); FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index)) exit(ENOENT); MemoryChunk * new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; ssize_t read; while ((read = getline(&linebuf, &size, stdin)) != -1) { if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; FullPinyinParser2 parser; ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); ChewingKeyRestVector key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); parser.parse(options, keys, key_rests, linebuf, strlen(linebuf)); if (0 == keys->len) { fprintf(stderr, "Invalid input.\n"); continue; } guint32 start = record_time(); PhraseIndexRanges ranges; memset(ranges, 0, sizeof(PhraseIndexRanges)); phrase_index.prepare_ranges(ranges); for (size_t i = 0; i < bench_times; ++i) { phrase_index.clear_ranges(ranges); largetable.search(keys->len, (ChewingKey *)keys->data, ranges); } print_time(start, bench_times); phrase_index.clear_ranges(ranges); largetable.search(keys->len, (ChewingKey *)keys->data, ranges); for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { GArray * & range = ranges[i]; if (!range) continue; if (range->len) printf("range items number:%d\n", range->len); for (size_t k = 0; k < range->len; ++k) { PhraseIndexRange * onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); PhraseItem item; for ( phrase_token_t token = 
onerange->m_range_begin; token != onerange->m_range_end; ++token){ phrase_index.get_phrase_item( token, item); /* get phrase string */ ucs4_t buffer[MAX_PHRASE_LENGTH + 1]; item.get_phrase_string(buffer); char * string = g_ucs4_to_utf8 ( buffer, item.get_phrase_length(), NULL, NULL, NULL); printf("%s\t", string); g_free(string); ChewingKey chewing_buffer[MAX_PHRASE_LENGTH]; size_t npron = item.get_n_pronunciation(); guint32 freq; for (size_t m = 0; m < npron; ++m){ item.get_nth_pronunciation(m, chewing_buffer, freq); for (size_t n = 0; n < item.get_phrase_length(); ++n){ gchar * pinyins = chewing_buffer[n].get_pinyin_string(); printf("%s'", pinyins); g_free(pinyins); } printf("\b\t%d\t", freq); } } printf("\n"); } g_array_set_size(range, 0); } phrase_index.destroy_ranges(ranges); g_array_free(keys, TRUE); g_array_free(key_rests, TRUE); } if (linebuf) free(linebuf); /* mask out all index items. */ largetable.mask_out(0x0, 0x0); return 0; }