int main( int argc, char * argv[]){
    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    pinyin_option_t options = USE_TONE;
    FacadeChewingTable largetable;

    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/pinyin_index.bin");
    largetable.load(options, chunk, NULL);

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;
    user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);

    gfloat lambda = system_table_info.get_lambda();
    
    PinyinLookup2 pinyin_lookup(lambda, options,
                                &largetable, &phrase_index,
                                &system_bigram, &user_bigram);

    /* prepare the prefixes for get_best_match. */
    TokenVector prefixes = g_array_new
        (FALSE, FALSE, sizeof(phrase_token_t));
    g_array_append_val(prefixes, sentence_start);
    
    CandidateConstraints constraints = g_array_new
        (TRUE, FALSE, sizeof(lookup_constraint_t));

    MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;
	
	FullPinyinParser2 parser;
	ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
	ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
	parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));

	if ( 0 == keys->len ) /* invalid pinyin */
	    continue;

        /* initialize constraints. */
	g_array_set_size(constraints, keys->len);
	for ( size_t i = 0; i < constraints->len; ++i){
	    lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
	    constraint->m_type = NO_CONSTRAINT;
	}

	guint32 start_time = record_time();
	for ( size_t i = 0; i < bench_times; ++i)
	    pinyin_lookup.get_best_match(prefixes, keys, constraints, results);
	print_time(start_time, bench_times);
	for ( size_t i = 0; i < results->len; ++i){
	    phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
	    if ( null_token == *token)
		continue;
	    printf("pos:%ld,token:%d\t", i, *token);
	}
	printf("\n");
	char * sentence = NULL;
	pinyin_lookup.convert_to_utf8(results, sentence);
	printf("%s\n", sentence);

	g_array_free(keys, TRUE);
	g_array_free(key_rests, TRUE);
	g_free(sentence);
    }

    g_array_free(prefixes, TRUE);
    g_array_free(constraints, TRUE);
    g_array_free(results, TRUE);

    free(linebuf);
    return 0;
}
/* Example #2 -- 0 */
/**
 * Record one (phrase, pinyin, freq) entry in g_chewing_tree.
 *
 * The phrase is converted to UCS-4 and used as the tree key; the parsed
 * pinyin keys together with freq are stored in the GArray of
 * chewing_and_freq_item kept as the tree value.  If an entry with the same
 * pronunciation already exists for the phrase, its frequency is increased
 * by freq instead of adding a second entry.
 *
 * Lines whose phrase is too long, or whose number of parsed pinyin keys
 * differs from the phrase length, are reported on stderr and skipped.
 *
 * @param phrase  UTF-8 phrase text.
 * @param pinyin  pinyin string for the phrase.
 * @param freq    frequency of this pronunciation.
 */
void feed_line(const char * phrase, const char * pinyin, const guint32 freq) {
    phrase_item * item = new phrase_item;
    item->length = g_utf8_strlen(phrase, -1);

    /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
     *	where is the code which I don't want to touch. :-)
     */

    if (item->length >= MAX_PHRASE_LENGTH) {
        fprintf(stderr, "Too long phrase:%s\t%s\t%u\n", phrase, pinyin, freq);
        delete item;
        return;
    }

    item->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);

    FullPinyinParser2 parser;
    ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
    ChewingKeyRestVector key_rests = g_array_new
        (FALSE, FALSE, sizeof(ChewingKeyRest));

    pinyin_option_t options = PINYIN_CORRECT_ALL | USE_TONE;
    parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
    assert(keys->len == key_rests->len);

    if (keys->len != item->length) {
        fprintf(stderr, "Invalid pinyin:%s\t%s\t%u\n", phrase, pinyin, freq);
        /* the parse results are not stored anywhere on this path,
         * so they must be released here. */
        g_array_free(keys, TRUE);
        g_array_free(key_rests, TRUE);
        /* NOTE(review): item->uniphrase was allocated by g_utf8_to_ucs4;
         * confirm that deleting the phrase_item releases it. */
        delete item;
        return;
    }

    GArray * array = (GArray *)g_tree_lookup(g_chewing_tree, item);

    chewing_and_freq_item value_item;
    value_item.keys = keys; value_item.key_rests = key_rests;
    value_item.freq = freq;

    assert(item->length == value_item.keys->len);
    if (NULL == array) {
        /* first pronunciation for this phrase: the tree now holds both
         * item (as key) and the new array (as value). */
        array = g_array_new(FALSE, FALSE, sizeof(chewing_and_freq_item));
        g_array_append_val(array, value_item);
        g_tree_insert(g_chewing_tree, item, array);
        return;
    }

    /* look for an existing entry with exactly the same pronunciation. */
    bool found = false;
    for (size_t i = 0; i < array->len; ++i) {
        chewing_and_freq_item * cur_item =
            &g_array_index(array, chewing_and_freq_item, i);
        int result = pinyin_exact_compare2
            ((ChewingKey *) value_item.keys->data,
             (ChewingKey *) cur_item->keys->data,
             value_item.keys->len);

        if (0 == result) {
            fprintf(stderr, "Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n",
                    phrase, pinyin, freq);
            cur_item->freq += freq;
            found = true;
        }
    }

    if (!found) {
        /* new pronunciation: value_item (and thus keys/key_rests) is now
         * referenced from the array. */
        g_array_append_val(array, value_item);
        g_tree_insert(g_chewing_tree, item, array);
    } else {
        /* clean up */
        g_array_free(keys, TRUE);
        g_array_free(key_rests, TRUE);
    }

    /* NOTE(review): item is deleted right after being passed to
     * g_tree_insert for an already-present key; this relies on the tree
     * not keeping the passed key in that case -- confirm against how
     * g_chewing_tree is created. */
    delete item;
}
/* Example #3 -- 0 */
/**
 * Load phrase items for one sub phrase index from a text table.
 *
 * Each record consists of four whitespace-separated fields:
 * pinyin, phrase, token and frequency.  Consecutive records carrying the
 * same token are collected into one PhraseItem as additional
 * pronunciations; when the token changes, the collected item is stored
 * with add_phrase_item and a fresh item is started.
 *
 * Reading stops at end of file or at the first record that cannot be
 * parsed completely.  Records whose number of parsed pinyin keys does not
 * match the phrase length are reported on stderr and not appended.
 *
 * @param phrase_index  index of the sub phrase index to fill; created on
 *                      demand when it does not exist yet.
 * @param infile        open text stream positioned at the first record.
 * @return always true.
 */
bool FacadePhraseIndex::load_text(guint8 phrase_index, FILE * infile){
    SubPhraseIndex * & sub_phrases = m_sub_phrase_indices[phrase_index];
    if ( !sub_phrases ){
        sub_phrases = new SubPhraseIndex;
    }

    char pinyin[256];
    char phrase[256];
    phrase_token_t token;
    size_t freq;
    PhraseItem * item_ptr = new PhraseItem;
    phrase_token_t cur_token = 0;

    for (;;) {
        /* Field widths keep the strings inside their 256-byte buffers and
         * %zu matches size_t.  Checking the conversion count (instead of
         * feof) also keeps a final record that has no trailing newline. */
        int nfields = fscanf(infile, "%255s %255s %u %zu",
                             pinyin, phrase, &token, &freq);
        if (4 != nfields) {
            if (EOF != nfields)
                fprintf(stderr,
                        "FacadePhraseIndex::load_text:incomplete record.\n");
            break;
        }

        assert(PHRASE_INDEX_LIBRARY_INDEX(token) == phrase_index );

        glong written = 0;
        utf16_t * phrase_utf16 = g_utf8_to_utf16(phrase, -1, NULL,
                                                 &written, NULL);
        if ( NULL == phrase_utf16 ){
            /* conversion failed: there is no phrase string to store. */
            fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
                    pinyin, phrase);
            continue;
        }

        /* first record: start collecting under its token. */
        if ( 0 == cur_token ){
            cur_token = token;
            item_ptr->set_phrase_string(written, phrase_utf16);
        }

        /* token changed: store the finished item and start a new one. */
        if ( cur_token != token ){
            add_phrase_item( cur_token, item_ptr);
            delete item_ptr;
            item_ptr = new PhraseItem;
            cur_token = token;
            item_ptr->set_phrase_string(written, phrase_utf16);
        }

        pinyin_option_t options = USE_TONE;
        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));

        if (item_ptr->get_phrase_length() == keys->len) {
            item_ptr->append_pronunciation((ChewingKey *)keys->data, freq);
        } else {
            fprintf(stderr, "FacadePhraseIndex::load_text:%s\t%s\n",
                    pinyin, phrase);
        }

        g_array_free(keys, TRUE);
        g_array_free(key_rests, TRUE);
        g_free(phrase_utf16);
    }

    /* store the item that was still being collected.
     * NOTE(review): this also runs when no record was read at all
     * (cur_token is still 0) -- confirm that this is intended. */
    add_phrase_item( cur_token, item_ptr);
    delete item_ptr;
    m_total_freq += m_sub_phrase_indices[phrase_index]->get_phrase_index_total_freq();
    return true;
}
/* Example #4 -- 0 */
int main(int argc, char * argv[]) {
    pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE;
    ChewingLargeTable largetable(options);
    FacadePhraseIndex phrase_index;

    FILE * gbfile = fopen("../../data/gb_char.table", "r");
    if (NULL == gbfile) {
	fprintf(stderr, "open gb_char.table failed!\n");
	exit(ENOENT);
    }

    largetable.load_text(gbfile);
    fseek(gbfile, 0L, SEEK_SET);
    phrase_index.load_text(1, gbfile);
    fclose(gbfile);

    FILE * gbkfile = fopen("../../data/gbk_char.table", "r");
    if (NULL == gbkfile) {
	fprintf(stderr, "open gbk_char.table failed!\n");
	exit(ENOENT);
    }

    largetable.load_text(gbkfile);
    fseek(gbkfile, 0L, SEEK_SET);
    phrase_index.load_text(2, gbkfile);
    fclose(gbkfile);

    MemoryChunk * new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);

    char* linebuf = NULL; size_t size = 0;
    while( getline(&linebuf, &size, stdin) ){
        linebuf[strlen(linebuf)-1] = '\0';
	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;

        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
        if (0 == keys->len) {
            fprintf(stderr, "Invalid input.\n");
            continue;
        }

        guint32 start = record_time();
        PhraseIndexRanges ranges;
        memset(ranges, 0, sizeof(PhraseIndexRanges));

        guint8 min_index, max_index;
        phrase_index.get_sub_phrase_range(min_index, max_index);

        for (size_t i = min_index; i < max_index; ++i) {
            ranges[i] = g_array_new(FALSE, FALSE, sizeof(PhraseIndexRange));
        }

        for (size_t i = 0; i < bench_times; ++i) {
            largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
        }

        for (size_t i = min_index; i < max_index; ++i) {
            g_array_set_size(ranges[i], 0);
        }
        print_time(start, bench_times);

        largetable.search(keys->len, (ChewingKey *)keys->data, ranges);

        for (size_t i = min_index; i < max_index; ++i) {
            GArray * & range = ranges[i];
            if (range) {
                if (range->len)
                    printf("range items number:%d\n", range->len);

                for (size_t k = 0; k < range->len; ++k) {
                    PhraseIndexRange * onerange =
                        &g_array_index(range, PhraseIndexRange, k);
                    printf("start:%d\tend:%d\n", onerange->m_range_begin,
                           onerange->m_range_end);

		    PhraseItem item;
		    for ( phrase_token_t token = onerange->m_range_begin;
                          token != onerange->m_range_end; ++token){

			phrase_index.get_phrase_item( token, item);

                        /* get phrase string */
			gunichar2 buffer[MAX_PHRASE_LENGTH + 1];
			item.get_phrase_string(buffer);
			char * string = g_utf16_to_utf8
			    ( buffer, item.get_phrase_length(),
			      NULL, NULL, NULL);
			printf("%s\t", string);
			g_free(string);

                        ChewingKey chewing_buffer[MAX_PHRASE_LENGTH];
                        size_t npron = item.get_n_pronunciation();
                        guint32 freq;
                        for (size_t m = 0; m < npron; ++m){
                            item.get_nth_pronunciation(m, chewing_buffer, freq);
                            for (size_t n = 0; n < item.get_phrase_length();
                                  ++n){
                                printf("%s'",
                                       chewing_buffer[n].get_pinyin_string());
                            }
                            printf("\b\t%d\t", freq);
                        }
                    }
                    printf("\n");
                }
            }
            g_array_set_size(range, 0);
        }
	g_array_free(keys, TRUE);
	g_array_free(key_rests, TRUE);
    }

    if (linebuf)
        free(linebuf);
    return 0;
}
int main(int argc, char * argv[]) {
    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    pinyin_option_t options = USE_TONE | PINYIN_INCOMPLETE;
    ChewingLargeTable largetable(options);
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_table(phrase_files, &largetable, NULL, &phrase_index))
        exit(ENOENT);

    MemoryChunk * new_chunk = new MemoryChunk;
    largetable.store(new_chunk);
    largetable.load(new_chunk);

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while ((read = getline(&linebuf, &size, stdin)) != -1) {
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;

        FullPinyinParser2 parser;
        ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
        ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));

        parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));
        if (0 == keys->len) {
            fprintf(stderr, "Invalid input.\n");
            continue;
        }

        guint32 start = record_time();
        PhraseIndexRanges ranges;
        memset(ranges, 0, sizeof(PhraseIndexRanges));

        phrase_index.prepare_ranges(ranges);

        for (size_t i = 0; i < bench_times; ++i) {
            phrase_index.clear_ranges(ranges);
            largetable.search(keys->len, (ChewingKey *)keys->data, ranges);
        }
        print_time(start, bench_times);

        phrase_index.clear_ranges(ranges);
        largetable.search(keys->len, (ChewingKey *)keys->data, ranges);

        for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
            GArray * & range = ranges[i];
            if (!range)
                continue;

            if (range->len)
                printf("range items number:%d\n", range->len);

            for (size_t k = 0; k < range->len; ++k) {
                PhraseIndexRange * onerange =
                    &g_array_index(range, PhraseIndexRange, k);
                printf("start:%d\tend:%d\n", onerange->m_range_begin,
                       onerange->m_range_end);

                PhraseItem item;
                for ( phrase_token_t token = onerange->m_range_begin;
                      token != onerange->m_range_end; ++token){

                    phrase_index.get_phrase_item( token, item);

                    /* get phrase string */
                    ucs4_t buffer[MAX_PHRASE_LENGTH + 1];
                    item.get_phrase_string(buffer);
                    char * string = g_ucs4_to_utf8
                        ( buffer, item.get_phrase_length(),
                          NULL, NULL, NULL);
                    printf("%s\t", string);
                    g_free(string);

                    ChewingKey chewing_buffer[MAX_PHRASE_LENGTH];
                    size_t npron = item.get_n_pronunciation();
                    guint32 freq;
                    for (size_t m = 0; m < npron; ++m){
                        item.get_nth_pronunciation(m, chewing_buffer, freq);
                        for (size_t n = 0; n < item.get_phrase_length();
                             ++n){
                            gchar * pinyins =
                                chewing_buffer[n].get_pinyin_string();
                            printf("%s'", pinyins);
                            g_free(pinyins);
                        }
                        printf("\b\t%d\t", freq);
                    }
                }
                printf("\n");
            }
            g_array_set_size(range, 0);
        }

        phrase_index.destroy_ranges(ranges);
	g_array_free(keys, TRUE);
	g_array_free(key_rests, TRUE);
    }

    if (linebuf)
        free(linebuf);

    /* mask out all index items. */
    largetable.mask_out(0x0, 0x0);

    return 0;
}