int main(int argc, char * argv[]){ FILE * input = stdin; FILE * output = stdout; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- n-gram segment"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } if (outputfile) { output = fopen(outputfile, "w"); if (NULL == output) { perror("open file failed"); exit(EINVAL); } } if (argc > 2) { fprintf(stderr, "too many arguments.\n"); exit(EINVAL); } if (2 == argc) { input = fopen(argv[1], "r"); if (NULL == input) { perror("open file failed"); exit(EINVAL); } } SystemTableInfo2 system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } /* init phrase table */ FacadePhraseTable3 phrase_table; phrase_table.load(SYSTEM_PHRASE_INDEX, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); /* init bi-gram */ Bigram system_bigram; system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); Bigram user_bigram; gfloat lambda = system_table_info.get_lambda(); /* init phrase lookup */ PhraseLookup phrase_lookup(lambda, &phrase_table, &phrase_index, &system_bigram, &user_bigram); CONTEXT_STATE state, next_state; GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); /* split the sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, input)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } /* check non-ucs4 characters */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); fprintf(output, "%d \n", null_token); continue; } /* only new-line persists. */ if ( 0 == num_of_chars ) { fprintf(output, "%d \n", null_token); continue; } state = CONTEXT_INIT; int result = phrase_table.search( 1, sentence, tokens); g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { int result = phrase_table.search( 1, sentence + i, tokens); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_ucs4, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); /* save the current character */ g_array_set_size(current_ucs4, 0); g_array_append_val(current_ucs4, sentence[i]); state = next_state; } if ( current_ucs4->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); g_array_set_size(current_ucs4, 0); } /* print extra enter */ if ( gen_extra_enter ) fprintf(output, "%d \n", null_token); g_free(sentence); } phrase_index.destroy_tokens(tokens); /* print enter at file tail */ fprintf(output, "%d \n", null_token); g_array_free(current_ucs4, TRUE); free(linebuf); fclose(input); fclose(output); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool gen_extra_enter = false; setlocale(LC_ALL, ""); //deal with options. while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp("--generate-extra-enter", argv[i]) == 0 ){ gen_extra_enter = true; } else { print_help(); exit(EINVAL); } ++i; } //init phrase table PhraseLargeTable phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); //init phrase index FacadePhraseIndex phrase_index; chunk = new MemoryChunk; chunk->load("gb_char.bin"); phrase_index.load(1, chunk); chunk = new MemoryChunk; chunk->load("gbk_char.bin"); phrase_index.load(2, chunk); //init bi-gram Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); Bigram user_bigram; //init phrase lookup g_phrase_lookup = new PhraseLookup(&phrase_table, &phrase_index, &system_bigram, &user_bigram); CONTEXT_STATE state, next_state; GArray * current_utf16 = g_array_new(TRUE, TRUE, sizeof(utf16_t)); phrase_token_t token = null_token; //split the sentence char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } //check non-ucs2 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; utf16_t * sentence = g_utf8_to_utf16(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf); printf("\n"); continue; } /* only new-line persists. */ if ( 0 == num_of_chars ) { printf("\n"); continue; } state = CONTEXT_INIT; bool result = phrase_table.search( 1, sentence, token); g_array_append_val( current_utf16, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { bool result = phrase_table.search( 1, sentence + i, token); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_utf16, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(current_utf16); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_utf16); /* save the current character */ g_array_set_size(current_utf16, 0); g_array_append_val(current_utf16, sentence[i]); state = next_state; } if ( current_utf16->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(current_utf16); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_utf16); g_array_set_size(current_utf16, 0); } /* print extra enter */ if ( gen_extra_enter ) printf("\n"); } delete g_phrase_lookup; g_phrase_lookup = NULL; /* print enter at file tail */ printf("\n"); g_array_free(current_utf16, TRUE); free(linebuf); return 0; }