Beispiel #1
0
/**
 * convert_to_utf8:
 * @phrase_index: index used to look up the phrase item of each token.
 * @match_results: array of phrase_token_t values to convert.
 * @delimiter: text inserted between consecutive phrases; NULL is treated
 *             as the empty string.
 * @result_string: receives the joined UTF-8 text.  It is reset to NULL on
 *                 entry and stays NULL when every token is null_token (or
 *                 the array is empty).  The string is built with
 *                 g_strdup()/g_strconcat().
 *
 * Joins the phrases of all non-null tokens into one UTF-8 string.
 *
 * Returns: always true.
 */
bool convert_to_utf8(FacadePhraseIndex * phrase_index,
                     MatchResults match_results,
                     /* in */ const char * delimiter,
                     /* out */ char * & result_string){
    const char * separator = (NULL == delimiter) ? "" : delimiter;
    result_string = NULL;

    PhraseItem item;

    size_t index = 0;
    while ( index < match_results->len ) {
        phrase_token_t token =
            g_array_index(match_results, phrase_token_t, index);
        ++index;

        /* null tokens carry no phrase and are skipped. */
        if ( null_token == token )
            continue;

        phrase_index->get_phrase_item(token, item);

        ucs4_t ucs4_chars[MAX_PHRASE_LENGTH];
        item.get_phrase_string(ucs4_chars);

        guint8 n_chars = item.get_phrase_length();
        gchar * utf8_phrase =
            g_ucs4_to_utf8(ucs4_chars, n_chars, NULL, NULL, NULL);

        if ( NULL == result_string ) {
            /* first phrase: nothing to join with yet. */
            result_string = g_strdup(utf8_phrase);
        } else {
            /* build the longer string, then drop the previous one. */
            char * previous = result_string;
            result_string =
                g_strconcat(previous, separator, utf8_phrase, NULL);
            g_free(previous);
        }

        g_free(utf8_phrase);
    }

    return true;
}
/**
 * gen_unigram:
 * @output: stream the "\1-gram" section is written to.
 * @phrase_index: index whose sub-indices are enumerated.
 *
 * Writes a "\1-gram" header followed by one "\item" line for every token
 * that has a phrase item with a non-zero unigram frequency.  Sub-indices
 * for which get_range() does not return ERROR_OK are skipped, as are
 * tokens without an item (ERROR_NO_ITEM).
 *
 * Returns: always true.
 */
bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
    fprintf(output, "\\1-gram\n");
    for ( size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; i++) {

        PhraseIndexRange range;
        int result = phrase_index->get_range(i, range);
        if (ERROR_OK != result )
            continue;

        PhraseItem item;
        for (phrase_token_t token = range.m_range_begin;
              token < range.m_range_end; token++) {
            /* distinct name: the outer "result" stays visible here. */
            int item_result = phrase_index->get_phrase_item(token, item);

            if ( ERROR_NO_ITEM == item_result )
                continue;
            assert( ERROR_OK == item_result );

            size_t freq = item.get_unigram_frequency();
            if ( 0 == freq )
                continue;

            char * phrase = taglib_token_to_string(phrase_index, token);
            /* freq is a size_t, so it must be printed with %zu; %ld is a
             * format/argument mismatch where long and size_t differ. */
            if ( phrase )
                fprintf(output, "\\item %d %s count %zu\n",
                        token, phrase, freq);

            g_free(phrase);
        }
    }
    return true;
}
/**
 * compute_interpolation:
 * @deleted_bigram: single-gram whose items (token, count) drive the
 *                  estimate and whose total frequency normalises it.
 * @unigram: phrase index providing per-token unigram frequencies.
 * @bigram: single-gram providing per-token bigram frequencies; may be NULL,
 *          in which case every bigram probability is taken as 0.
 *
 * Iteratively re-estimates the interpolation weight lambda, starting from
 * 0.6, until two successive estimates differ by at most 0.001.  In each
 * round every item of @deleted_bigram contributes
 *   count * (lambda * P_bigram) / (lambda * P_bigram + (1 - lambda) * P_unigram)
 * and the sum is divided by the total frequency of @deleted_bigram.
 *
 * Returns: the converged lambda.
 */
parameter_t compute_interpolation(SingleGram * deleted_bigram,
				  FacadePhraseIndex * unigram,
				  SingleGram * bigram){
    parameter_t lambda = 0, next_lambda = 0.6;
    const parameter_t epsilon = 0.001;

    while ( fabs(lambda - next_lambda) > epsilon ){
        lambda = next_lambda;
        next_lambda = 0;
        parameter_t numerator = 0;
        parameter_t part_of_denominator = 0;

        BigramPhraseWithCountArray array =
            g_array_new(FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
        deleted_bigram->retrieve_all(array);

        for ( guint i = 0; i < array->len; ++i ){
            BigramPhraseItemWithCount * entry =
                &g_array_index(array, BigramPhraseItemWithCount, i);
            /* get the phrase token */
            phrase_token_t token = entry->m_token;
            guint32 deleted_count = entry->m_count;

            {
                /* bigram share: lambda * freq / total_freq */
                guint32 freq = 0;
                parameter_t elem_poss = 0;
                if (bigram && bigram->get_freq(token, freq)){
                    guint32 total_freq = 0;
                    /* The call must live outside assert(): with NDEBUG the
                     * assert expression is not evaluated at all. */
                    bool has_total = bigram->get_total_freq(total_freq);
                    assert(has_total);
                    (void) has_total;
                    assert(0 != total_freq);
                    elem_poss = freq / (parameter_t) total_freq;
                }
                numerator = lambda * elem_poss;
            }

            {
                /* unigram share: (1 - lambda) * freq / total_freq */
                parameter_t elem_poss = 0;
                PhraseItem phrase_item;
                /* a zero result from get_phrase_item() is the case in which
                 * the item is read. */
                if (!unigram->get_phrase_item(token, phrase_item)){
                    guint32 freq = phrase_item.get_unigram_frequency();
                    guint32 total_freq =
                        unigram->get_phrase_index_total_freq();
                    elem_poss = freq / (parameter_t)total_freq;
                }
                part_of_denominator = (1 - lambda) * elem_poss;
            }

            /* neither model knows the token: it contributes nothing. */
            if (0 == (numerator + part_of_denominator))
                continue;

            next_lambda += deleted_count *
                (numerator / (numerator + part_of_denominator));
        }

        guint32 table_num = 0;
        /* as above, keep the side-effecting call out of assert(). */
        bool has_table_total = deleted_bigram->get_total_freq(table_num);
        assert(has_table_total);
        (void) has_table_total;
        next_lambda /= table_num;

        g_array_free(array, TRUE);
    }
    lambda = next_lambda;
    return lambda;
}
/**
 * get_possible_pinyin:
 * @phrase_index: index used to look up the phrase item of each token.
 * @tokens: array of phrase_token_t values.
 * @keys: output array; emptied on entry, then filled with the selected
 *        pronunciation keys of every token, in order.
 *
 * For each token, picks the pronunciation with the highest frequency (the
 * first pronunciation when none has a frequency above zero) and appends
 * get_phrase_length() keys of it to @keys.
 *
 * Returns: always true.
 */
bool get_possible_pinyin(FacadePhraseIndex * phrase_index,
                         TokenVector tokens, ChewingKeyVector keys){
    ChewingKey buffer[MAX_PHRASE_LENGTH];
    g_array_set_size(keys, 0);

    for (size_t i = 0; i < tokens->len; ++i){
        phrase_token_t * token = &g_array_index(tokens, phrase_token_t, i);
        PhraseItem item;
        phrase_index->get_phrase_item(*token, item);

        size_t key_index = 0;
        guint32 max_freq = 0;
        for ( size_t m = 0; m < item.get_n_pronunciation(); ++m ) {
            guint32 freq = 0;
            /* The lookup must run outside assert(): with NDEBUG the assert
             * expression is not evaluated and freq would never be set. */
            bool found = item.get_nth_pronunciation(m, buffer, freq);
            assert(found);
            (void) found;
            if ( freq > max_freq ) {
                key_index = m;
                max_freq = freq;
            }
        }

        /* reload the winning pronunciation into buffer before copying it;
         * the scan above leaves the last pronunciation there. */
        guint32 freq = 0;
        bool found = item.get_nth_pronunciation(key_index, buffer, freq);
        assert(found);
        (void) found;
        assert(max_freq == freq);

        guint8 len = item.get_phrase_length();
        g_array_append_vals(keys, buffer, len);
    }
    return true;
}
int main( int argc, char * argv[]){

    PinyinCustomSettings custom;
    PinyinLargeTable largetable(&custom);

    FacadePhraseIndex phrase_index;

    FILE * gbfile = fopen("../../data/gb_char.table", "r");
    if ( gbfile == NULL ) {
	fprintf(stderr, "open gb_char.table failed!\n");
	exit(ENOENT);
    }

    largetable.load_text(gbfile);
    fseek(gbfile, 0L, SEEK_SET);
    phrase_index.load_text(1, gbfile);
    fclose(gbfile);

    FILE * gbkfile = fopen("../../data/gbk_char.table","r");
    if ( gbkfile == NULL ) {
	fprintf(stderr, "open gb_char.table failed!\n");
	exit(ENOENT);
    }
    
    largetable.load_text(gbkfile);
    fseek(gbkfile, 0L, SEEK_SET);
    phrase_index.load_text(2, gbkfile);
    fclose(gbkfile);

    MemoryChunk* new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);
    
    char* linebuf = NULL;
    size_t size = 0;
    while( getline(&linebuf, &size, stdin) ){
        linebuf[strlen(linebuf)-1] = '\0';
	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;
	
	PinyinDefaultParser parser;
	NullPinyinValidator validator;
	PinyinKeyVector keys;
	PinyinKeyPosVector poses;
	
	keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
	poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
	parser.parse(validator, keys, poses, linebuf);
	
	guint32 start = record_time();

	PhraseIndexRanges ranges;
	for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
	    ranges[i] = g_array_new(FALSE, FALSE, sizeof (PhraseIndexRange));
	}
	for ( size_t i = 0 ; i < bench_times; ++i){
	    largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
	}
       
	for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
	    GArray * range = ranges[i];
	    g_array_set_size( range, 0);
	}
	print_time(start, bench_times);

	largetable.search(keys->len, (PinyinKey *)keys->data, ranges);
	for( size_t i = 0 ; i < PHRASE_INDEX_LIBRARY_COUNT ; ++i){
	    GArray * range = ranges[i];
	    if ( range ){
		for (size_t k = 0; k < range->len; ++k){
		    PhraseIndexRange* onerange = &g_array_index(range, PhraseIndexRange, k);
		    printf("start:%d\tend:%d\n", onerange->m_range_begin, onerange->m_range_end); 
		    PhraseItem item;
		    for ( phrase_token_t token = onerange->m_range_begin; token != onerange->m_range_end; ++token){
			phrase_index.get_phrase_item( token, item);
			gunichar2 bufstr[1024];
			item.get_phrase_string(bufstr);
			char * string = g_utf16_to_utf8
			    ( bufstr, item.get_phrase_length(), 
			      NULL, NULL, NULL);
			printf("%s\t", string);
			g_free(string);
			PinyinKey pinyin_buffer[1024];
			size_t npron = item.get_n_pronunciation();
			guint32 freq;
			for ( size_t n = 0; n < npron; ++n){
			    item.get_nth_pronunciation(n, pinyin_buffer, freq);
			    for ( size_t o = 0; o < item.get_phrase_length(); ++o){
				printf("%s'", pinyin_buffer[o].get_key_string());
			    }
			    printf("\b\t%d\t", freq);
			}
			printf("\n");
		    }
		}
		if ( range->len)
		    printf("range items number:%d\n", range->len);
	    }
	    g_array_set_size( range, 0);
	}

	g_array_free(keys, TRUE);
	g_array_free(poses, TRUE);
    }
    if (linebuf)
        free(linebuf);
    return 0;
}
int main(int argc, char * argv[]) {
    pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE;
    ChewingLargeTable largetable(options);
    FacadePhraseIndex phrase_index;

    FILE * gbfile = fopen("../../data/gb_char.table", "r");
    if (NULL == gbfile) {
	fprintf(stderr, "open gb_char.table failed!\n");
	exit(ENOENT);
    }

    largetable.load_text(gbfile);
    fseek(gbfile, 0L, SEEK_SET);
    phrase_index.load_text(1, gbfile);
    fclose(gbfile);

    FILE * gbkfile = fopen("../../data/gbk_char.table", "r");
    if (NULL == gbkfile) {
	fprintf(stderr, "open gbk_char.table failed!\n");
	exit(ENOENT);
    }

    largetable.load_text(gbkfile);
    fseek(gbkfile, 0L, SEEK_SET);
    phrase_index.load_text(2, gbkfile);
    fclose(gbkfile);

    MemoryChunk * new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);

    char* linebuf = NULL; size_t size = 0;
    while( getline(&linebuf, &size, stdin) ){
        linebuf[strlen(linebuf)-1] = '\0';
	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;

        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
        if (0 == keys->len) {
            fprintf(stderr, "Invalid input.\n");
            continue;
        }

        guint32 start = record_time();
        PhraseIndexRanges ranges;
        memset(ranges, 0, sizeof(PhraseIndexRanges));

        guint8 min_index, max_index;
        phrase_index.get_sub_phrase_range(min_index, max_index);

        for (size_t i = min_index; i < max_index; ++i) {
            ranges[i] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
        }

        for (size_t i = 0; i < bench_times; ++i) {
            largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
        }

        for (size_t i = min_index; i < max_index; ++i) {
            g_array_set_size(ranges[i], 0);
        }
        print_time(start, bench_times);

        largetable.search(keys->len, (ChewingKey *)keys->data, ranges);

        for (size_t i = min_index; i < max_index; ++i) {
            GArray * & range = ranges[i];
            if (range) {
                if (range->len)
                    printf("range items number:%d\n", range->len);

                for (size_t k = 0; k < range->len; ++k) {
                    PhraseIndexRange * onerange =
                        &g_array_index(range, PhraseIndexRange, k);
                    printf("start:%d\tend:%d\n", onerange->m_range_begin,
                           onerange->m_range_end);

		    PhraseItem item;
		    for ( phrase_token_t token = onerange->m_range_begin;
                          token != onerange->m_range_end; ++token){

			phrase_index.get_phrase_item( token, item);

                        /* get phrase string */
			gunichar2 buffer[MAX_PHRASE_LENGTH + 1];
			item.get_phrase_string(buffer);
			char * string = g_utf16_to_utf8
			    ( buffer, item.get_phrase_length(),
			      NULL, NULL, NULL);
			printf("%s\t", string);
			g_free(string);

                        ChewingKey chewing_buffer[MAX_PHRASE_LENGTH];
                        size_t npron = item.get_n_pronunciation();
                        guint32 freq;
                        for (size_t m = 0; m < npron; ++m){
                            item.get_nth_pronunciation(m, chewing_buffer, freq);
                            for (size_t n = 0; n < item.get_phrase_length();
                                  ++n){
                                printf("%s'",
                                       chewing_buffer[n].get_pinyin_string());
                            }
                            printf("\b\t%d\t", freq);
                        }
                    }
                    printf("\n");
                }
            }
            g_array_set_size(range, 0);
        }
	g_array_free(keys, TRUE);
	g_array_free(key_rests, TRUE);
    }

    if (linebuf)
        free(linebuf);
    return 0;
}
int main(int argc, char * argv[]) {
    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE;
    ChewingLargeTable largetable(options);
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index))
        exit(ENOENT);

    MemoryChunk * new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while ((read = getline(&linebuf, &size, stdin)) != -1) {
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;

        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
        if (0 == keys->len) {
            fprintf(stderr, "Invalid input.\n");
            continue;
        }

        guint32 start = record_time();
        PhraseIndexRanges ranges;
        memset(ranges, 0, sizeof(PhraseIndexRanges));

        phrase_index.prepare_ranges(ranges);

        for (size_t i = 0; i < bench_times; ++i) {
            phrase_index.clear_ranges(ranges);
            largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
        }
        print_time(start, bench_times);

        phrase_index.clear_ranges(ranges);
        largetable.search(keys->len, (ChewingKey *)keys->data, ranges);

        for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
            GArray * & range = ranges[i];
            if (!range)
                continue;

            if (range->len)
                printf("range items number:%d\n", range->len);

            for (size_t k = 0; k < range->len; ++k) {
                PhraseIndexRange * onerange =
                    &g_array_index(range, PhraseIndexRange, k);
                printf("start:%d\tend:%d\n", onerange->m_range_begin,
                       onerange->m_range_end);

                PhraseItem item;
                for ( phrase_token_t token = onerange->m_range_begin;
                      token != onerange->m_range_end; ++token){

                    phrase_index.get_phrase_item( token, item);

                    /* get phrase string */
                    ucs4_t buffer[MAX_PHRASE_LENGTH + 1];
                    item.get_phrase_string(buffer);
                    char * string = g_ucs4_to_utf8
                        ( buffer, item.get_phrase_length(),
                          NULL, NULL, NULL);
                    printf("%s\t", string);
                    g_free(string);

                    ChewingKey chewing_buffer[MAX_PHRASE_LENGTH];
                    size_t npron = item.get_n_pronunciation();
                    guint32 freq;
                    for (size_t m = 0; m < npron; ++m){
                        item.get_nth_pronunciation(m, chewing_buffer, freq);
                        for (size_t n = 0; n < item.get_phrase_length();
                             ++n){
                            gchar * pinyins =
                                chewing_buffer[n].get_pinyin_string();
                            printf("%s'", pinyins);
                            g_free(pinyins);
                        }
                        printf("\b\t%d\t", freq);
                    }
                }
                printf("\n");
            }
            g_array_set_size(range, 0);
        }

        phrase_index.destroy_ranges(ranges);
	g_array_free(keys, TRUE);
	g_array_free(key_rests, TRUE);
    }

    if (linebuf)
        free(linebuf);

    /* mask out all index items. */
    largetable.mask_out(0x0, 0x0);

    return 0;
}