/**
 * Convenience overload of find_phrases() that accepts a raw pinyin string.
 *
 * The string is parsed by a PinyinDefaultParser using this library's
 * validator (*m_validator); the resulting parsed keys are then forwarded to
 * the PinyinParsedKeyVector overload of find_phrases().
 *
 * @param vec       forwarded unchanged to the key-vector overload.
 * @param keys      pinyin text handed to the parser.
 * @param noshorter forwarded unchanged to the key-vector overload.
 * @param nolonger  forwarded unchanged to the key-vector overload.
 * @return the value returned by the key-vector overload.
 */
int PinyinPhraseLib::find_phrases (PhraseVector &vec, const char *keys, bool noshorter, bool nolonger)
{
    PinyinParsedKeyVector parsed_keys;
    PinyinDefaultParser default_parser;

    /* Turn the textual pinyin into parsed keys first ... */
    default_parser.parse (*m_validator, parsed_keys, keys);

    /* ... then delegate the actual lookup to the key-based overload. */
    const int result = find_phrases (vec, parsed_keys, noshorter, nolonger);
    return result;
}
int main( int argc, char * argv[]){ PinyinCustomSettings custom; PinyinLargeTable largetable(&custom); FacadePhraseIndex phrase_index; FILE * gbfile = fopen("../../data/gb_char.table", "r"); if ( gbfile == NULL ) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbfile); fseek(gbfile, 0L, SEEK_SET); phrase_index.load_text(1, gbfile); fclose(gbfile); FILE * gbkfile = fopen("../../data/gbk_char.table","r"); if ( gbkfile == NULL ) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbkfile); fseek(gbkfile, 0L, SEEK_SET); phrase_index.load_text(2, gbkfile); fclose(gbkfile); MemoryChunk* new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; while( getline(&linebuf, &size, stdin) ){ linebuf[strlen(linebuf)-1] = '\0'; if ( strcmp ( linebuf, "quit" ) == 0) break; PinyinDefaultParser parser; NullPinyinValidator validator; PinyinKeyVector keys; PinyinKeyPosVector poses; keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); parser.parse(validator, keys, poses, linebuf); guint32 start = record_time(); PhraseIndexRanges ranges; for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange)); } for ( size_t i = 0 ; i < bench_times; ++i){ largetable.search(keys->len, (PinyinKey *)keys->data, ranges); } for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ GArray * range = ranges[i]; g_array_set_size( range, 0); } print_time(start, bench_times); largetable.search(keys->len, (PinyinKey *)keys->data, ranges); for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ GArray * range = ranges[i]; if ( range ){ for (size_t k = 0; k < range->len; ++k){ PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); PhraseItem item; for ( 
phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){ phrase_index.get_phrase_item( token, item); gunichar2 bufstr[1024]; item.get_phrase_string(bufstr); char * string = g_utf16_to_utf8 ( bufstr, item.get_phrase_length(), NULL, NULL, NULL); printf("%s\t", string); g_free(string); PinyinKey pinyin_buffer[1024]; size_t npron = item.get_n_pronunciation(); guint32 freq; for ( size_t n = 0; n < npron; ++n){ item.get_nth_pronunciation(n, pinyin_buffer, freq); for ( size_t o = 0; o < item.get_phrase_length(); ++o){ printf("%s'", pinyin_buffer[o].get_key_string()); } printf("\b\t%d\t", freq); } printf("\n"); } } if ( range->len) printf("range items number:%d\n", range->len); } g_array_set_size( range, 0); } g_array_free(keys, TRUE); g_array_free(poses, TRUE); } if (linebuf) free(linebuf); return 0; }
void feed_line (const char * phrase, const char * pinyin, const guint32 freq){ phrase_item * new_phrase_ptr = (phrase_item *) malloc( sizeof(phrase_item)); new_phrase_ptr->length = g_utf8_strlen(phrase, -1); /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp * where is the code which I don't want to touch. :-) */ if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) { fprintf(stderr, "too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); free(new_phrase_ptr); return; } new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); PinyinDefaultParser parser; NullPinyinValidator validator; PinyinKeyVector keys; PinyinKeyPosVector poses; keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); parser.parse(validator, keys, poses, pinyin); GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr); pinyin_and_freq_item value_item; value_item.pinyin = keys; value_item.freq = freq; if(new_phrase_ptr->length != value_item.pinyin->len){ fprintf(stderr, "error:phrase:%s\tpinyin:%s\n", phrase, pinyin); return; } if ( array == NULL){ array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item)); g_array_append_val(array, value_item); g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); return; } bool found = false; for ( size_t i = 0; i < array->len ; ++i){ pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i); int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len); if ( result == 0 ){ printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", phrase, pinyin, freq); old_value_item->freq += freq; found = true; } } g_array_free(poses, TRUE); if ( !found ){ g_array_append_val(array, value_item); g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); }else g_array_free(keys, TRUE); free(new_phrase_ptr); //g_array_free(keys, TRUE); }
int main( int argc, char * argv[]){ PinyinCustomSettings custom; PinyinLargeTable largetable(&custom); FILE * gbfile = fopen("../../data/gb_char.table", "r"); if ( gbfile == NULL ) { fprintf(stderr, "open gb_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbfile); fclose(gbfile); FILE * gbkfile = fopen("../../data/gbk_char.table","r"); if ( gbkfile == NULL ) { fprintf(stderr, "open gbk_char.table failed!\n"); exit(ENOENT); } largetable.load_text(gbkfile); fclose(gbkfile); MemoryChunk* new_chunk = new MemoryChunk; largetable.store(new_chunk); largetable.load(new_chunk); char* linebuf = NULL; size_t size = 0; while( getline(&linebuf, &size, stdin) ){ linebuf[strlen(linebuf)-1] = '\0'; if ( strcmp ( linebuf, "quit" ) == 0) break; PinyinDefaultParser parser; NullPinyinValidator validator; PinyinKeyVector keys; PinyinKeyPosVector poses; keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); parser.parse(validator, keys, poses, linebuf); guint32 start = record_time(); PhraseIndexRanges ranges; for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange)); } for ( size_t i = 0 ; i < bench_times; ++i){ largetable.search(keys->len, (PinyinKey *)keys->data, ranges); } for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ GArray * range = ranges[i]; g_array_set_size( range, 0); } print_time(start, bench_times); largetable.search(keys->len, (PinyinKey *)keys->data, ranges); for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){ GArray * range = ranges[i]; if (range) { if (range->len) printf("range items number:%d\n", range->len); for (size_t k = 0; k < range->len; ++k) { PhraseIndexRange * onerange = &g_array_index(range, PhraseIndexRange, k); printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); } } g_array_set_size(range, 0); } g_array_free(keys, TRUE); g_array_free(poses, TRUE); } if (linebuf) free(linebuf); 
return 0; }