Пример #1
0
/**
 * Overload of find_phrases() that accepts the pinyin as a raw C string.
 *
 * The text in @p keys is run through a PinyinDefaultParser together with
 * this library's validator (*m_validator); the parsed keys it produces are
 * then forwarded, along with the remaining arguments, to the find_phrases()
 * overload that accepts the parsed key vector.
 *
 * @param vec       passed through unchanged to the forwarding call.
 * @param keys      C string handed to the parser.
 * @param noshorter passed through unchanged to the forwarding call.
 * @param nolonger  passed through unchanged to the forwarding call.
 * @return the value returned by the forwarded find_phrases() call.
 */
int
PinyinPhraseLib::find_phrases (PhraseVector &vec,
							   const char *keys,
							   bool noshorter,
							   bool nolonger)
{
	PinyinParsedKeyVector parsed_keys;
	PinyinDefaultParser default_parser;

	// The parser's own return value is not inspected; only the keys it
	// wrote into parsed_keys are used.
	default_parser.parse (*m_validator, parsed_keys, keys);

	const int result = find_phrases (vec, parsed_keys, noshorter, nolonger);
	return result;
}
Пример #2
0
int main( int argc, char * argv[]){

    PinyinCustomSettings custom;
    PinyinLargeTable largetable(&custom);

    FacadePhraseIndex phrase_index;

    FILE * gbfile = fopen("../../data/gb_char.table", "r");
    if ( gbfile == NULL ) {
	fprintf(stderr, "open gb_char.table failed!\n");
	exit(ENOENT);
    }

    largetable.load_text(gbfile);
    fseek(gbfile, 0L, SEEK_SET);
    phrase_index.load_text(1, gbfile);
    fclose(gbfile);

    FILE * gbkfile = fopen("../../data/gbk_char.table","r");
    if ( gbkfile == NULL ) {
	fprintf(stderr, "open gb_char.table failed!\n");
	exit(ENOENT);
    }
    
    largetable.load_text(gbkfile);
    fseek(gbkfile, 0L, SEEK_SET);
    phrase_index.load_text(2, gbkfile);
    fclose(gbkfile);

    MemoryChunk* new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);
    
    char* linebuf = NULL;
    size_t size = 0;
    while( getline(&linebuf, &size, stdin) ){
        linebuf[strlen(linebuf)-1] = '\0';
	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;
	
	PinyinDefaultParser parser;
	NullPinyinValidator validator;
	PinyinKeyVector keys;
	PinyinKeyPosVector poses;
	
	keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
	poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
	parser.parse(validator, keys, poses, linebuf);
	
	guint32 start = record_time();

	PhraseIndexRanges ranges;
	for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
	    ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
	}
	for ( size_t i = 0 ; i < bench_times; ++i){
	    largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
	}
       
	for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
	    GArray * range = ranges[i];
	    g_array_set_size( range, 0);
	}
	print_time(start, bench_times);

	largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
	for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
	    GArray * range = ranges[i];
	    if ( range ){
		for (size_t k = 0; k < range->len; ++k){
		    PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k);
		    printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); 
		    PhraseItem item;
		    for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){
			phrase_index.get_phrase_item( token, item);
			gunichar2 bufstr[1024];
			item.get_phrase_string(bufstr);
			char * string = g_utf16_to_utf8
			    ( bufstr, item.get_phrase_length(), 
			      NULL, NULL, NULL);
			printf("%s\t", string);
			g_free(string);
			PinyinKey pinyin_buffer[1024];
			size_t npron = item.get_n_pronunciation();
			guint32 freq;
			for ( size_t n = 0; n < npron; ++n){
			    item.get_nth_pronunciation(n, pinyin_buffer, freq);
			    for ( size_t o = 0; o < item.get_phrase_length(); ++o){
				printf("%s'", pinyin_buffer[o].get_key_string());
			    }
			    printf("\b\t%d\t", freq);
			}
			printf("\n");
		    }
		}
		if ( range->len)
		    printf("range items number:%d\n", range->len);
	    }
	    g_array_set_size( range, 0);
	}

	g_array_free(keys, TRUE);
	g_array_free(poses, TRUE);
    }
    if (linebuf)
        free(linebuf);
    return 0;
}
Пример #3
0
void feed_line (const char * phrase, const char * pinyin, const guint32 freq){
    phrase_item * new_phrase_ptr = (phrase_item *)
	malloc( sizeof(phrase_item));     
    new_phrase_ptr->length = g_utf8_strlen(phrase, -1);
	/* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
	 *	where is the code which I don't want to touch. :-)
	 */
	if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) {
		fprintf(stderr, "too long phrase:%s\t%s\t%d\n", phrase,
			pinyin, freq);
		free(new_phrase_ptr);
		return;
	}
    new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
    
    PinyinDefaultParser parser;
    NullPinyinValidator validator;
    PinyinKeyVector keys;
    PinyinKeyPosVector poses;
    
    keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
    poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
    parser.parse(validator, keys, poses, pinyin);

    GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr);

    pinyin_and_freq_item value_item;
    value_item.pinyin = keys;
    value_item.freq = freq;
    
    if(new_phrase_ptr->length != value_item.pinyin->len){
	fprintf(stderr, "error:phrase:%s\tpinyin:%s\n", phrase, pinyin);
	return;
    }

    if ( array == NULL){
	array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item));
	g_array_append_val(array, value_item);
	g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
	return;
    }
    bool found = false;
    for ( size_t i = 0; i < array->len ; ++i){
	pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i);
	int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, 
					  (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len);
	if ( result == 0 ){
	    printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", 
		   phrase, pinyin, freq);
	    old_value_item->freq += freq;
	    found = true;
	}
    }

    g_array_free(poses, TRUE);
    
    if ( !found ){
	g_array_append_val(array, value_item);
	g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
    }else
	g_array_free(keys, TRUE);

    free(new_phrase_ptr);
    //g_array_free(keys, TRUE);
}
Пример #4
0
/**
 * Interactive lookup / timing driver for PinyinLargeTable.
 *
 * Start-up:
 *   - loads ../../data/gb_char.table and ../../data/gbk_char.table into the
 *     large table,
 *   - stores the table into a MemoryChunk and loads it back.
 *
 * Then, for every line read from stdin (until EOF or the literal "quit"):
 *   - parses the line into pinyin keys,
 *   - runs largetable.search() bench_times times and prints the timing,
 *   - runs one more search and prints, per sub-index, the number of ranges
 *     found and the begin/end of each range.
 *
 * Exits with ENOENT if either data file cannot be opened; returns 0 otherwise.
 */
int main( int argc, char * argv[]){

    PinyinCustomSettings custom;
    PinyinLargeTable largetable(&custom);

    FILE * gbfile = fopen("../../data/gb_char.table", "r");
    if ( gbfile == NULL ) {
        fprintf(stderr, "open gb_char.table failed!\n");
        exit(ENOENT);
    }

    largetable.load_text(gbfile);
    fclose(gbfile);

    FILE * gbkfile = fopen("../../data/gbk_char.table","r");
    if ( gbkfile == NULL ) {
        fprintf(stderr, "open gbk_char.table failed!\n");
        exit(ENOENT);
    }

    largetable.load_text(gbkfile);
    fclose(gbkfile);

    /* NOTE(review): new_chunk is never deleted here; presumably load()
     * takes ownership of it -- confirm against PinyinLargeTable. */
    MemoryChunk* new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);

    char* linebuf = NULL;
    size_t size = 0;
    ssize_t nread;
    /* getline() returns -1 on EOF/error, which is truthy, so the result
     * must be compared explicitly or the loop never terminates. */
    while( (nread = getline(&linebuf, &size, stdin)) != -1 ){
        /* Strip the trailing newline only when there is one: the last line
         * of input may lack it, and blindly cutting the final byte would
         * eat a real character (or index before the buffer on ""). */
        if ( nread > 0 && linebuf[nread - 1] == '\n' )
            linebuf[nread - 1] = '\0';
        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        PinyinDefaultParser parser;
        NullPinyinValidator validator;
        PinyinKeyVector keys;
        PinyinKeyPosVector poses;

        keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
        poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
        parser.parse(validator, keys, poses, linebuf);

        guint32 start = record_time();

        PhraseIndexRanges ranges;
        for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
            ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
        }
        /* Timed section: repeat the same search bench_times times. */
        for ( size_t i = 0 ; i < bench_times; ++i){
            largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
        }

        /* Drop whatever the benchmark runs accumulated before the
         * search whose results are actually printed. */
        for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
            GArray * range = ranges[i];
            g_array_set_size( range, 0);
        }
        print_time(start, bench_times);

        largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
        for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
            GArray * range = ranges[i];
            if ( !range )
                continue;

            if (range->len)
                printf("range items number:%d\n", range->len);

            for (size_t k = 0; k < range->len; ++k) {
                PhraseIndexRange * onerange =
                    &g_array_index(range, PhraseIndexRange, k);
                printf("start:%d\tend:%d\n", onerange->m_range_begin,
                       onerange->m_range_end);

            }

            /* The arrays are re-created on every input line, so release
             * them here instead of merely truncating (was a leak). */
            g_array_free(range, TRUE);
            ranges[i] = NULL;
        }

        g_array_free(keys, TRUE);
        g_array_free(poses, TRUE);
    }
    if (linebuf)
        free(linebuf);
    return 0;
}