示例#1
0
int main(int argc, char * argv[]){
    FILE * input = stdin;
    FILE * output = stdout;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- n-gram segment");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    if (outputfile) {
        output = fopen(outputfile, "w");
        if (NULL == output) {
            perror("open file failed");
            exit(EINVAL);
        }
    }

    if (argc > 2) {
        fprintf(stderr, "too many arguments.\n");
        exit(EINVAL);
    }

    if (2 == argc) {
        input = fopen(argv[1], "r");
        if (NULL == input) {
            perror("open file failed");
            exit(EINVAL);
        }
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* init phrase table */
    FacadePhraseTable3 phrase_table;
    phrase_table.load(SYSTEM_PHRASE_INDEX, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
    Bigram user_bigram;

    gfloat lambda = system_table_info.get_lambda();

    /* init phrase lookup */
    PhraseLookup phrase_lookup(lambda,
                               &phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);


    CONTEXT_STATE state, next_state;
    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);

    /* split the sentence */
    char * linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, input)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            fprintf(output, "%d \n", null_token);
            continue;
        }

        /* only new-line persists. */
        if ( 0  == num_of_chars ) {
            fprintf(output, "%d \n", null_token);
            continue;
        }

        state = CONTEXT_INIT;
        int result = phrase_table.search( 1, sentence, tokens);
        g_array_append_val( current_ucs4, sentence[0]);
        if ( result & SEARCH_OK )
            state = CONTEXT_SEGMENTABLE;
        else
            state = CONTEXT_UNKNOWN;

        for ( int i = 1; i < num_of_chars; ++i) {
            int result = phrase_table.search( 1, sentence + i, tokens);
            if ( result & SEARCH_OK )
                next_state = CONTEXT_SEGMENTABLE;
            else
                next_state = CONTEXT_UNKNOWN;

            if ( state == next_state ){
                g_array_append_val(current_ucs4, sentence[i]);
                continue;
            }

            assert ( state != next_state );
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4, output);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4, output);

            /* save the current character */
            g_array_set_size(current_ucs4, 0);
            g_array_append_val(current_ucs4, sentence[i]);
            state = next_state;
        }

        if ( current_ucs4->len ) {
            /* this seems always true. */
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4, output);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4, output);
            g_array_set_size(current_ucs4, 0);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            fprintf(output, "%d \n", null_token);

        g_free(sentence);
    }
    phrase_index.destroy_tokens(tokens);

    /* print enter at file tail */
    fprintf(output, "%d \n", null_token);
    g_array_free(current_ucs4, TRUE);
    free(linebuf);
    fclose(input);
    fclose(output);
    return 0;
}
int main(int argc, char * argv[]){
    setlocale(LC_ALL, "");

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/phrase_index.bin");
    phrase_table.load(chunk, NULL);

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    gfloat lambda = system_table_info.get_lambda();

    /* init phrase lookup */
    PhraseLookup phrase_lookup(lambda,
                               &phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);

    /* try one sentence */
    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters are not accepted.\n");
            g_free(sentence);
            continue;
        }

        try_phrase_lookup(&phrase_lookup, sentence, len);
        g_free(sentence);
    }

    free(linebuf);
    return 0;
}
示例#3
0
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");
    //deal with options.
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }


    //init phrase table
    PhraseLargeTable phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/phrase_index.bin");
    phrase_table.load(chunk);

    //init phrase index
    FacadePhraseIndex phrase_index;
    chunk = new MemoryChunk;
    chunk->load("../../data/gb_char.bin");
    phrase_index.load(1, chunk);
    chunk = new MemoryChunk;
    chunk->load("../../data/gbk_char.bin");
    phrase_index.load(2, chunk);

    //init bi-gram
    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    //init phrase lookup
    PhraseLookup phrase_lookup(&phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);

    //try one sentence
    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        //check non-ucs2 characters
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs2 characters are not accepted.\n");
            g_free(sentence);
            continue;
        }

        try_phrase_lookup(&phrase_lookup, sentence, len);
        g_free(sentence);
    }

    free(linebuf);
    return 0;
}
示例#4
0
int main(int argc, char * argv[]){
    int i = 1;
    bool gen_extra_enter = false;

    setlocale(LC_ALL, "");
    /* deal with options */
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--generate-extra-enter", argv[i]) == 0 ){
            gen_extra_enter = true;
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach("bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    /* init phrase lookup */
    PhraseLookup phrase_lookup(&phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);


    CONTEXT_STATE state, next_state;
    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);

    /* split the sentence */
    char * linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            printf("\n");
            continue;
        }

        /* only new-line persists. */
        if ( 0  == num_of_chars ) {
            printf("\n");
            continue;
        }

        state = CONTEXT_INIT;
        int result = phrase_table.search( 1, sentence, tokens);
        g_array_append_val( current_ucs4, sentence[0]);
        if ( result & SEARCH_OK )
            state = CONTEXT_SEGMENTABLE;
        else
            state = CONTEXT_UNKNOWN;

        for ( int i = 1; i < num_of_chars; ++i) {
            int result = phrase_table.search( 1, sentence + i, tokens);
            if ( result & SEARCH_OK )
                next_state = CONTEXT_SEGMENTABLE;
            else
                next_state = CONTEXT_UNKNOWN;

            if ( state == next_state ){
                g_array_append_val(current_ucs4, sentence[i]);
                continue;
            }

            assert ( state != next_state );
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4);

            /* save the current character */
            g_array_set_size(current_ucs4, 0);
            g_array_append_val(current_ucs4, sentence[i]);
            state = next_state;
        }

        if ( current_ucs4->len ) {
            /* this seems always true. */
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4);
            g_array_set_size(current_ucs4, 0);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            printf("\n");
    }
    phrase_index.destroy_tokens(tokens);

    /* print enter at file tail */
    printf("\n");
    g_array_free(current_ucs4, TRUE);
    free(linebuf);
    return 0;
}