int main(int argc, char * argv[]){
    FILE * output = stdout;
    const char * bigram_filename = "bigram.db";

    FacadePhraseIndex phrase_index;

    //gb_char binary file
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("gb_char.bin");
    phrase_index.load(1, chunk);

    //gbk_char binary file
    chunk = new MemoryChunk;
    chunk->load("gbk_char.bin");
    phrase_index.load(2, chunk);

    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_READONLY);

    begin_data(output);

    gen_unigram(output, &phrase_index);
    gen_bigram(output, &phrase_index, &bigram);

    end_data(output);
    return 0;
}
Beispiel #2
0
int main(int argc, char * argv[]){

    FacadePhraseIndex phrase_index;
    
    /* gb_char binary file */
    MemoryChunk * chunk = new MemoryChunk;
    bool retval = chunk->load("gb_char.bin");
    if (!retval) {
        fprintf(stderr, "open gb_char.bin failed!\n");
        exit(ENOENT);
    }
    phrase_index.load(1, chunk);
    
    /* gbk_char binary file */
    chunk = new MemoryChunk;
    retval = chunk->load("gbk_char.bin");
    if (!retval) {
        fprintf(stderr, "open gbk_char.bin failed!\n");
        exit(ENOENT);
    }
    phrase_index.load(2, chunk);

    /* Note: please increase the value when corpus size becomes larger.
     *  To avoid zero value when computing unigram frequency in float format.
     */
    guint32 freq = 1; PhraseIndexRange range;
    int result = phrase_index.get_range(1, range);
    if ( result == ERROR_OK ) {
        for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) {
            phrase_index.add_unigram_frequency(i, freq);
        }
    }

#if 1
    result = phrase_index.get_range(2, range);
    if ( result == ERROR_OK ) {
        for ( size_t i = range.m_range_begin; i <= range.m_range_end; ++i ) {
            phrase_index.add_unigram_frequency(i, freq);
        }
    }
#endif

    MemoryChunk * new_chunk = new MemoryChunk;
    phrase_index.store(1, new_chunk);
    new_chunk->save("gb_char.bin");
    phrase_index.load(1, new_chunk);

    new_chunk = new MemoryChunk;
    phrase_index.store(2, new_chunk);
    new_chunk->save("gbk_char.bin");
    phrase_index.load(2, new_chunk);

    return 0;
}
int main(int argc, char * argv[]){
    FacadePhraseIndex phrase_index;
    
    //gb_char binary file
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("gb_char.bin");
    phrase_index.load(1, chunk);
    
    //gbk_char binary file
    chunk = new MemoryChunk;
    chunk->load("gbk_char.bin");
    phrase_index.load(2, chunk);

    Bigram bigram;
    bigram.attach("bigram.db", ATTACH_READONLY);

    Bigram deleted_bigram;
    deleted_bigram.attach("deleted_bigram.db", ATTACH_READONLY);

    GArray * deleted_items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    deleted_bigram.get_all_items(deleted_items);

    parameter_t lambda_sum = 0;
    int lambda_count = 0;

    for ( int i = 0; i < deleted_items->len; ++i ){
	phrase_token_t * token = &g_array_index(deleted_items, phrase_token_t, i);
	SingleGram * single_gram = NULL;
	bigram.load(*token, single_gram);

	SingleGram * deleted_single_gram = NULL;
	deleted_bigram.load(*token, deleted_single_gram);
	
	parameter_t lambda = compute_interpolation(deleted_single_gram, &phrase_index, single_gram);
	
	printf("token:%d lambda:%f\n", *token, lambda);

	lambda_sum += lambda;
	lambda_count ++;

	if (single_gram)
            delete single_gram;
	delete deleted_single_gram;
    }

    printf("average lambda:%f\n", (lambda_sum/lambda_count));
    g_array_free(deleted_items, TRUE);
    return 0;
}
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;
    FILE * output = stdout;

    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp ("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i > argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    FacadePhraseIndex phrase_index;

    //gb_char binary file
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("gb_char.bin");
    phrase_index.load(1, chunk);

    //gbk_char binary file
    chunk = new MemoryChunk;
    chunk->load("gbk_char.bin");
    phrase_index.load(2, chunk);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READONLY);

    print_k_mixture_model_magic_header(output, &bigram);
    print_k_mixture_model_array_headers(output, &bigram, &phrase_index);
    print_k_mixture_model_array_items(output, &bigram, &phrase_index);

    end_data(output);

    return 0;
}
int main( int argc, char * argv[]){
    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    pinyin_option_t options = USE_TONE;
    FacadeChewingTable largetable;

    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/pinyin_index.bin");
    largetable.load(options, chunk, NULL);

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;
    user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);

    gfloat lambda = system_table_info.get_lambda();
    
    PinyinLookup2 pinyin_lookup(lambda, options,
                                &largetable, &phrase_index,
                                &system_bigram, &user_bigram);

    /* prepare the prefixes for get_best_match. */
    TokenVector prefixes = g_array_new
        (FALSE, FALSE, sizeof(phrase_token_t));
    g_array_append_val(prefixes, sentence_start);
    
    CandidateConstraints constraints = g_array_new
        (TRUE, FALSE, sizeof(lookup_constraint_t));

    MatchResults results = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

	if ( strcmp ( linebuf, "quit" ) == 0)
	    break;
	
	FullPinyinParser2 parser;
	ChewingKeyVector keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
	ChewingKeyRestVector key_rests =
            g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
	parser.parse(options, keys, key_rests, linebuf, strlen(linebuf));

	if ( 0 == keys->len ) /* invalid pinyin */
	    continue;

        /* initialize constraints. */
	g_array_set_size(constraints, keys->len);
	for ( size_t i = 0; i < constraints->len; ++i){
	    lookup_constraint_t * constraint = &g_array_index(constraints, lookup_constraint_t, i);
	    constraint->m_type = NO_CONSTRAINT;
	}

	guint32 start_time = record_time();
	for ( size_t i = 0; i < bench_times; ++i)
	    pinyin_lookup.get_best_match(prefixes, keys, constraints, results);
	print_time(start_time, bench_times);
	for ( size_t i = 0; i < results->len; ++i){
	    phrase_token_t * token = &g_array_index(results, phrase_token_t, i);
	    if ( null_token == *token)
		continue;
	    printf("pos:%ld,token:%d\t", i, *token);
	}
	printf("\n");
	char * sentence = NULL;
	pinyin_lookup.convert_to_utf8(results, sentence);
	printf("%s\n", sentence);

	g_array_free(keys, TRUE);
	g_array_free(key_rests, TRUE);
	g_free(sentence);
    }

    g_array_free(prefixes, TRUE);
    g_array_free(constraints, TRUE);
    g_array_free(results, TRUE);

    free(linebuf);
    return 0;
}
int main(int argc, char * argv[]){
    FILE * input = stdin;
    const char * bigram_filename = "bigram.db";

    PhraseLargeTable phrases;

    MemoryChunk * chunk = new MemoryChunk;
    bool retval = chunk->load("phrase_index.bin");
    if (!retval) {
        fprintf(stderr, "open phrase_index.bin failed!\n");
        exit(ENOENT);
    }
    phrases.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    Bigram bigram;
    retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
    if (!retval) {
        fprintf(stderr, "open %s failed!\n", bigram_filename);
        exit(ENOENT);
    }

    taglib_init();

    values = g_ptr_array_new();
    required = g_hash_table_new(g_str_hash, g_str_equal);

    //enter "\data" line
    assert(taglib_add_tag(BEGIN_LINE, "\\data", 0, "model", ""));
    ssize_t result = my_getline(input);
    if ( result == -1 ) {
        fprintf(stderr, "empty file input.\n");
        exit(ENODATA);
    }

    //read "\data" line
    if ( !taglib_read(linebuf, line_type, values, required) ) {
        fprintf(stderr, "error: interpolation model expected.\n");
        exit(ENODATA);
    }

    assert(line_type == BEGIN_LINE);
    char * value = NULL;
    assert(g_hash_table_lookup_extended(required, "model", NULL, (gpointer *)&value));
    if ( !( strcmp("interpolation", value) == 0 ) ) {
        fprintf(stderr, "error: interpolation model expected.\n");
        exit(ENODATA);
    }

    result = my_getline(input);
    if ( result != -1 )
        parse_body(input, &phrases, &phrase_index, &bigram);

    taglib_fini();

    if (!save_phrase_index(&phrase_index))
        exit(ENOENT);

    return 0;
}
int main(int argc, char * argv[]){
  MemoryChunk* chunk;
  chunk = new MemoryChunk();
  int i = 12;
  chunk->set_content(0, &i, sizeof(int));

  int * p = (int *)chunk->begin();
  assert(chunk->size() == sizeof(int));
  std::cout<<*p<<std::endl;
  std::cout<<chunk->capacity()<<std::endl;
  p = & i;
  chunk->set_chunk(p, sizeof(int), NULL);
  short t = 5;
  chunk->set_content(sizeof(int), &t, sizeof(short));
  assert( sizeof(int) + sizeof(short) == chunk->size());
  std::cout<<chunk->capacity()<<std::endl;

  p = (int *)chunk->begin();
  short * p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
  std::cout<<*p<<'\t'<<*p2<<std::endl;

  chunk->set_content(sizeof(int) + sizeof(short), &t, sizeof(short));
  
  assert( sizeof(int) + (sizeof(short) << 1) == chunk->size());
  std::cout<<chunk->capacity()<<std::endl;
  p = (int *)chunk->begin();
  p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));
  std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<std::endl;

  chunk->set_size(sizeof(int) + sizeof(short) *3);
  p = (int *)chunk->begin();
  p2 =(short *)(((char *) (chunk->begin())) + sizeof(int));

  chunk->set_content(0, &i, sizeof(int));

  *(p2+2) = 3;
  std::cout<<*p<<'\t'<<*p2<<'\t'<<*(p2 + 1)<<'\t'<<*(p2+2)<<std::endl;

  int m = 10;
  chunk->set_chunk(&m, sizeof(int), NULL);
  int n = 12;
  chunk->insert_content(sizeof(int), &n, sizeof(int));
  n = 11;
  chunk->insert_content(sizeof(int), &n, sizeof(int));

  int * p3 = (int *)chunk->begin();
  std::cout<<*p3<<'\t'<<*(p3+1)<<'\t'<<*(p3+2)<<std::endl;
	
  chunk->remove_content(sizeof(int), sizeof(int));
  std::cout<<*p3<<'\t'<<*(p3+1)<<std::endl;

  int tmp;
  assert(chunk->get_content(sizeof(int), &tmp, sizeof(int)));
  std::cout<<tmp<<std::endl;
  
  
  delete chunk;

  const char * filename =  "/tmp/version";
  const char * version = "0.2.0";

  chunk =  new MemoryChunk;
  bool retval = chunk->load(filename);
  if ( !retval ){
      std::cerr<<"can't find chunk"<<std::endl;
  }else{
      if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){
	  std::cout<<"match"<<std::endl;
      }

  }

  chunk->set_content(0, version, strlen(version) + 1);
  chunk->save(filename);

  retval = chunk->load(filename);
  if ( !retval ){
      std::cerr<<"can't find chunk"<<std::endl;
  }
  if ( memcmp(version, chunk->begin(), strlen(version) + 1) == 0){
      std::cout<<"match"<<std::endl;
  }

  return 0;
}
Beispiel #8
0
int main(int argc, char * argv[]){
    int i = 1;
    bool gen_extra_enter = false;

    setlocale(LC_ALL, "");
    //deal with options.
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--generate-extra-enter", argv[i]) == 0 ){
            gen_extra_enter = true;
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    //init phrase table
    PhraseLargeTable phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    //init phrase index
    FacadePhraseIndex phrase_index;
    chunk = new MemoryChunk;
    chunk->load("gb_char.bin");
    phrase_index.load(1, chunk);
    chunk = new MemoryChunk;
    chunk->load("gbk_char.bin");
    phrase_index.load(2, chunk);

    //init bi-gram
    Bigram system_bigram;
    system_bigram.attach("bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    //init phrase lookup
    g_phrase_lookup = new PhraseLookup(&phrase_table, &phrase_index,
                                       &system_bigram, &user_bigram);


    CONTEXT_STATE state, next_state;
    GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t));
    phrase_token_t token = null_token;

    //split the sentence
    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        //check non-ucs2 characters
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
            printf("\n");
            continue;
        }

        /* only new-line persists. */
        if ( 0  == num_of_chars ) {
            printf("\n");
            continue;
        }

        state = CONTEXT_INIT;
        bool result = phrase_table.search( 1, sentence, token);
        g_array_append_val( current_utf16, sentence[0]);
        if ( result & SEARCH_OK )
            state = CONTEXT_SEGMENTABLE;
        else
            state = CONTEXT_UNKNOWN;

        for ( int i = 1; i < num_of_chars; ++i) {
            bool result = phrase_table.search( 1, sentence + i, token);
            if ( result & SEARCH_OK )
                next_state = CONTEXT_SEGMENTABLE;
            else
                next_state = CONTEXT_UNKNOWN;

            if ( state == next_state ){
                g_array_append_val(current_utf16, sentence[i]);
                continue;
            }

            assert ( state != next_state );
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(current_utf16);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_utf16);

            /* save the current character */
            g_array_set_size(current_utf16, 0);
            g_array_append_val(current_utf16, sentence[i]);
            state = next_state;
        }

        if ( current_utf16->len ) {
            /* this seems always true. */
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(current_utf16);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_utf16);
            g_array_set_size(current_utf16, 0);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            printf("\n");
    }

    delete g_phrase_lookup;
    g_phrase_lookup = NULL;
    /* print enter at file tail */
    printf("\n");
    g_array_free(current_utf16, TRUE);
    free(linebuf);
    return 0;
}
int main(int argc, char * argv[]){
    const char * evals_text = "evals.text";

    pinyin_option_t options = USE_TONE;
    FacadeChewingTable largetable;

    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("pinyin_index.bin");
    largetable.load(options, chunk, NULL);

    FacadePhraseTable2 phrase_table;
    chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk, NULL);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    Bigram system_bigram;
    system_bigram.attach("bigram.db", ATTACH_READONLY);
    Bigram user_bigram;
    user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);

    PinyinLookup2 pinyin_lookup(options, &largetable, &phrase_index,
                               &system_bigram, &user_bigram);

    /* open evals.text. */
    FILE * evals_file = fopen(evals_text, "r");
    if ( NULL == evals_file ) {
        fprintf(stderr, "Can't open file:%s\n", evals_text);
        exit(ENOENT);
    }

    PhraseTokens phrase_tokens;
    memset(phrase_tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(phrase_tokens);

    /* Evaluates the correction rate of test text documents. */
    size_t tested_count = 0; size_t passed_count = 0;
    char* linebuf = NULL; size_t size = 0;
    TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));

    phrase_token_t token = null_token;
    while( getline(&linebuf, &size, evals_file) ) {
        if ( feof(evals_file) )
            break;
        if ( '\n' == linebuf[strlen(linebuf)-1] )
            linebuf[strlen(linebuf)-1] = '\0';

        glong phrase_len = 0;
        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);

        token = null_token;
        if ( 0 != phrase_len ) {
            int result = phrase_table.search(phrase_len, phrase, phrase_tokens);
            int num = get_first_token(phrase_tokens, token);

            if ( !(result & SEARCH_OK) )
                token = null_token;

            g_free(phrase);
            phrase = NULL;
        }

        if ( null_token == token ) {
            if ( tokens->len ) { /* one test. */
                if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
                    tested_count ++; passed_count ++;
                } else {
                    tested_count ++;
                }
                g_array_set_size(tokens, 0);
            }
        } else {
            g_array_append_val(tokens, token);
        }
    }

    if ( tokens->len ) { /* one test. */
        if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
            tested_count ++; passed_count ++;
        } else {
            tested_count ++;
        }
    }

    parameter_t rate = passed_count / (parameter_t) tested_count;
    printf("correction rate:%f\n", rate);

    g_array_free(tokens, TRUE);
    fclose(evals_file);
    free(linebuf);

    phrase_index.destroy_tokens(phrase_tokens);

    return 0;
}
Beispiel #10
0
int main(int argc, char * argv[]){
    int i = 1;
    bool train_pi_gram = true;
    const char * bigram_filename = "bigram.db";

    setlocale(LC_ALL, "");
    while ( i < argc ){
	if ( strcmp("--help", argv[i]) == 0){
	    print_help();
            exit(0);
	}else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
	    train_pi_gram = false;
	}else if ( strcmp("--bigram-file", argv[i]) == 0){
            if ( ++i >= argc ) {
                print_help();
                exit(EINVAL);
            }
            bigram_filename = argv[i];
	}else{
            print_help();
            exit(EINVAL);
        }
	++i;
    }
    
    g_phrases = new PhraseLargeTable;
    //init phrase lookup
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    g_phrases->load(chunk);

    FacadePhraseIndex phrase_index;
    
    //gb_char binary file
    chunk = new MemoryChunk;
    chunk->load("gb_char.bin");
    phrase_index.load(1, chunk);
    
    //gbk_char binary file
    chunk = new MemoryChunk;
    chunk->load("gbk_char.bin");
    phrase_index.load(2, chunk);
    
    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
    
    
    char* linebuf = NULL;
    size_t size = 0;
    phrase_token_t last_token, cur_token = last_token = 0;
    while( getline(&linebuf, &size, stdin) ){
	if ( feof(stdin) )
	    break;
        linebuf[strlen(linebuf)-1] = '\0';

        glong phrase_len = 0;
        utf16_t * phrase = g_utf8_to_utf16(linebuf, -1, NULL, &phrase_len, NULL);

	phrase_token_t token = 0;
        if ( 0 != phrase_len ) {
            int result = g_phrases->search( phrase_len, phrase, token);
            if ( ! (result & SEARCH_OK) )
                token = 0;
            g_free(phrase);
            phrase = NULL;
        }

	last_token = cur_token;
	cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        //training uni-gram
        phrase_index.add_unigram_frequency(cur_token, 1);

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        //train bi-gram
        SingleGram * single_gram = NULL;
        bigram.load(last_token, single_gram);

        if ( NULL == single_gram ){
            single_gram = new SingleGram;
        }
        guint32 freq, total_freq;
        //increase freq
        if (single_gram->get_freq(cur_token, freq))
            assert(single_gram->set_freq(cur_token, freq + 1));
        else
            assert(single_gram->insert_freq(cur_token, 1));
        //increase total freq
        single_gram->get_total_freq(total_freq);
        single_gram->set_total_freq(total_freq + 1);

        bigram.store(last_token, single_gram);
        delete single_gram;
    }

    free(linebuf);
    
    MemoryChunk * new_chunk = new MemoryChunk;
    phrase_index.store(1, new_chunk);
    new_chunk->save("gb_char.bin");
    phrase_index.load(1, new_chunk);

    new_chunk = new MemoryChunk;
    phrase_index.store(2, new_chunk);
    new_chunk->save("gbk_char.bin");
    phrase_index.load(2, new_chunk);

    return 0;
}
Beispiel #11
0
int main(int argc, char * argv[]){
    int i = 1;
    bool gen_extra_enter = false;

    setlocale(LC_ALL, "");
    //deal with options.
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0) {
            print_help();
            exit(0);
        } else if (strcmp("--generate-extra-enter", argv[i]) == 0) {
            gen_extra_enter = true;
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        //check non-ucs4 characters
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            printf("\n");
            continue;
        }

        //do segment stuff
        GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
        segment(&phrase_table, &phrase_index, sentence, len, strings);

        //print out the split phrase
        for ( glong i = 0; i < strings->len; ++i ) {
            SegmentStep * step = &g_array_index(strings, SegmentStep, i);
            char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
            printf("%d %s\n", step->m_handle, string);
            g_free(string);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            printf("\n");

        g_array_free(strings, TRUE);
        g_free(sentence);
    }

    /* print enter at file tail */
    printf("\n");
    return 0;
}
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");
    //deal with options.
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }


    //init phrase table
    PhraseLargeTable phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/phrase_index.bin");
    phrase_table.load(chunk);

    //init phrase index
    FacadePhraseIndex phrase_index;
    chunk = new MemoryChunk;
    chunk->load("../../data/gb_char.bin");
    phrase_index.load(1, chunk);
    chunk = new MemoryChunk;
    chunk->load("../../data/gbk_char.bin");
    phrase_index.load(2, chunk);

    //init bi-gram
    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    //init phrase lookup
    PhraseLookup phrase_lookup(&phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);

    //try one sentence
    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        //check non-ucs2 characters
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs2 characters are not accepted.\n");
            g_free(sentence);
            continue;
        }

        try_phrase_lookup(&phrase_lookup, sentence, len);
        g_free(sentence);
    }

    free(linebuf);
    return 0;
}
Beispiel #13
0
int main(int argc, char * argv[]){
    FILE * input = stdin;
    FILE * output = stdout;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- n-gram segment");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    if (outputfile) {
        output = fopen(outputfile, "w");
        if (NULL == output) {
            perror("open file failed");
            exit(EINVAL);
        }
    }

    if (argc > 2) {
        fprintf(stderr, "too many arguments.\n");
        exit(EINVAL);
    }

    if (2 == argc) {
        input = fopen(argv[1], "r");
        if (NULL == input) {
            perror("open file failed");
            exit(EINVAL);
        }
    }

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load(SYSTEM_PHRASE_INDEX);
    phrase_table.load(chunk, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
    Bigram user_bigram;

    gfloat lambda = system_table_info.get_lambda();

    /* init phrase lookup */
    PhraseLookup phrase_lookup(lambda,
                               &phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);


    CONTEXT_STATE state, next_state;
    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);

    /* split the sentence */
    char * linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, input)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            fprintf(output, "%d \n", null_token);
            continue;
        }

        /* only new-line persists. */
        if ( 0  == num_of_chars ) {
            fprintf(output, "%d \n", null_token);
            continue;
        }

        state = CONTEXT_INIT;
        int result = phrase_table.search( 1, sentence, tokens);
        g_array_append_val( current_ucs4, sentence[0]);
        if ( result & SEARCH_OK )
            state = CONTEXT_SEGMENTABLE;
        else
            state = CONTEXT_UNKNOWN;

        for ( int i = 1; i < num_of_chars; ++i) {
            int result = phrase_table.search( 1, sentence + i, tokens);
            if ( result & SEARCH_OK )
                next_state = CONTEXT_SEGMENTABLE;
            else
                next_state = CONTEXT_UNKNOWN;

            if ( state == next_state ){
                g_array_append_val(current_ucs4, sentence[i]);
                continue;
            }

            assert ( state != next_state );
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4, output);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4, output);

            /* save the current character */
            g_array_set_size(current_ucs4, 0);
            g_array_append_val(current_ucs4, sentence[i]);
            state = next_state;
        }

        if ( current_ucs4->len ) {
            /* this seems always true. */
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4, output);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4, output);
            g_array_set_size(current_ucs4, 0);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            fprintf(output, "%d \n", null_token);

        g_free(sentence);
    }
    phrase_index.destroy_tokens(tokens);

    /* print enter at file tail */
    fprintf(output, "%d \n", null_token);
    g_array_free(current_ucs4, TRUE);
    free(linebuf);
    fclose(input);
    fclose(output);
    return 0;
}
int main(int argc, char * argv[]){
    int i = 1;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- generate k mixture model");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load(SYSTEM_PHRASE_INDEX);
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(g_k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
Beispiel #15
0
int main(int argc, char * argv[]){
    int i = 1;
    bool train_pi_gram = true;
    const char * bigram_filename = "bigram.db";

    setlocale(LC_ALL, "");
    while ( i < argc ){
	if ( strcmp("--help", argv[i]) == 0){
	    print_help();
            exit(0);
	}else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
	    train_pi_gram = false;
	}else if ( strcmp("--bigram-file", argv[i]) == 0){
            if ( ++i >= argc ) {
                print_help();
                exit(EINVAL);
            }
            bigram_filename = argv[i];
	}else{
            print_help();
            exit(EINVAL);
        }
	++i;
    }
    
    PhraseLargeTable2 phrase_table;
    /* init phrase table */
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);
    
    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);
    
    char* linebuf = NULL;
    size_t size = 0;
    phrase_token_t last_token, cur_token = last_token = 0;
    while( getline(&linebuf, &size, stdin) ){
	if ( feof(stdin) )
	    break;
        linebuf[strlen(linebuf)-1] = '\0';

        glong phrase_len = 0;
        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);

	phrase_token_t token = null_token;
        if ( 0 != phrase_len ) {
            phrase_index.clear_tokens(tokens);
            int result = phrase_table.search(phrase_len, phrase, tokens);
            int num = get_first_token(tokens, token);
            if ( !(result & SEARCH_OK) )
                token = null_token;
            g_free(phrase);
            phrase = NULL;
        }

	last_token = cur_token;
	cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        /* training uni-gram */
        phrase_index.add_unigram_frequency(cur_token, 1);

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        /* train bi-gram */
        SingleGram * single_gram = NULL;
        bigram.load(last_token, single_gram);

        if ( NULL == single_gram ){
            single_gram = new SingleGram;
        }
        guint32 freq, total_freq;
        /* increase freq */
        if (single_gram->get_freq(cur_token, freq))
            assert(single_gram->set_freq(cur_token, freq + 1));
        else
            assert(single_gram->insert_freq(cur_token, 1));
        /* increase total freq */
        single_gram->get_total_freq(total_freq);
        single_gram->set_total_freq(total_freq + 1);

        bigram.store(last_token, single_gram);
        delete single_gram;
    }

    phrase_index.destroy_tokens(tokens);
    free(linebuf);
    
    if (!save_phrase_index(&phrase_index))
        exit(ENOENT);

    return 0;
}
int main(int argc, char * argv[]){
    int i = 1;
    const char * k_mixture_model_filename = NULL;

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            g_train_pi_gram = false;
        } else if ( strcmp("--maximum-occurs-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_occurs = atoi(argv[i]);
        } else if ( strcmp("--maximum-increase-rates-allowed", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            g_maximum_increase_rates = atof(argv[i]);
        } else if ( strcmp("--k-mixture-model-file", argv[i]) == 0 ){
            if ( ++i >= argc ){
                print_help();
                exit(EINVAL);
            }
            k_mixture_model_filename = argv[i];
        } else {
            break;
        }
        ++i;
    }

    PhraseLargeTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
    bigram.attach(k_mixture_model_filename, ATTACH_READWRITE|ATTACH_CREATE);

    while ( i < argc ){
        const char * filename = argv[i];
        FILE * document = fopen(filename, "r");
        if ( NULL == document ){
            int err_saved = errno;
            fprintf(stderr, "can't open file: %s.\n", filename);
            fprintf(stderr, "error:%s.\n", strerror(err_saved));
            exit(err_saved);
        }

        HashofDocument hash_of_document = g_hash_table_new
            (g_direct_hash, g_direct_equal);
        HashofUnigram hash_of_unigram = g_hash_table_new
            (g_direct_hash, g_direct_equal);

        assert(read_document(&phrase_table, &phrase_index, document,
                             hash_of_document, hash_of_unigram));
        fclose(document);
        document = NULL;

        GHashTableIter iter;
        gpointer key, value;

        /* train the document, and convert it to k mixture model. */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            phrase_token_t token1 = GPOINTER_TO_UINT(key);
            train_second_word(hash_of_unigram, &bigram,
                              hash_of_document, token1);
        }

        KMixtureModelMagicHeader magic_header;
        assert(bigram.get_magic_header(magic_header));
        magic_header.m_N ++;
        assert(bigram.set_magic_header(magic_header));

        post_processing_unigram(&bigram, hash_of_unigram);

        /* free resources of g_hash_of_document */
        g_hash_table_iter_init(&iter, hash_of_document);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            HashofSecondWord second_word = (HashofSecondWord) value;
            g_hash_table_iter_steal(&iter);
            g_hash_table_unref(second_word);
        }
        g_hash_table_unref(hash_of_document);
        hash_of_document = NULL;

        g_hash_table_unref(hash_of_unigram);
        hash_of_unigram = NULL;

        ++i;
    }

    return 0;
}
Beispiel #17
0
int main(int argc, char * argv[]){
    FILE * input = stdin;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- generate n-gram");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable2 phrase_table;
    /* init phrase table */
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load(SYSTEM_PHRASE_INDEX);
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);
    
    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);

    char* linebuf = NULL; size_t size = 0;
    phrase_token_t last_token, cur_token = last_token = 0;
    while( getline(&linebuf, &size, input) ){
	if ( feof(input) )
	    break;

        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        TAGLIB_PARSE_SEGMENTED_LINE(&phrase_index, token, linebuf);

	last_token = cur_token;
	cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        /* training uni-gram */
        phrase_index.add_unigram_frequency(cur_token, 1);

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        /* train bi-gram */
        SingleGram * single_gram = NULL;
        bigram.load(last_token, single_gram);

        if ( NULL == single_gram ){
            single_gram = new SingleGram;
        }
        guint32 freq, total_freq;
        /* increase freq */
        if (single_gram->get_freq(cur_token, freq))
            assert(single_gram->set_freq(cur_token, freq + 1));
        else
            assert(single_gram->insert_freq(cur_token, 1));
        /* increase total freq */
        single_gram->get_total_freq(total_freq);
        single_gram->set_total_freq(total_freq + 1);

        bigram.store(last_token, single_gram);
        delete single_gram;
    }

    free(linebuf);
    
    if (!save_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    return 0;
}
int main(int argc, char * argv[]){
    setlocale(LC_ALL, "");

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/phrase_index.bin");
    phrase_table.load(chunk, NULL);

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    gfloat lambda = system_table_info.get_lambda();

    /* init phrase lookup */
    PhraseLookup phrase_lookup(lambda,
                               &phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);

    /* try one sentence */
    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters are not accepted.\n");
            g_free(sentence);
            continue;
        }

        try_phrase_lookup(&phrase_lookup, sentence, len);
        g_free(sentence);
    }

    free(linebuf);
    return 0;
}