void output(const char * file_name) { FILE * fp = fopen(file_name, "wb"); if (!fp) { fprintf(stderr, _("Can not write file: %s\n"), file_name); exit(1); } size_t i, item_count; for (i = DATRIE_SIZE - 1; i > 0; i --) if (dat[i].parent != DATRIE_UNUSED) break; item_count = i + 1; size_t lexicon_length = lexicon[lexicon_count - 1].pos + ucs4len(lexicon[lexicon_count - 1].value) + 1; fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp); fwrite(&lexicon_length, sizeof(size_t), 1, fp); fwrite(&item_count, sizeof(size_t), 1, fp); for (i = 0; i < lexicon_count; i ++) { fwrite(lexicon[i].value, sizeof(ucs4_t), ucs4len(lexicon[i].value) + 1, fp); } fwrite(dat, sizeof(dat[0]), item_count, fp); fclose(fp); }
void init(const char * filename) { dictionary_group_t dictionary_group = dictionary_group_open(NULL); if (dictionary_group_load(dictionary_group, filename, OPENCC_DICTIONARY_TYPE_TEXT) == -1) { dictionary_perror("Dictionary loading error"); fprintf(stderr, _("\n")); exit(1); } dictionary_t t_dictionary = dictionary_group_get_dictionary(dictionary_group, 0); if (t_dictionary == (dictionary_t) -1) { dictionary_perror("Dictionary loading error"); fprintf(stderr, _("\n")); exit(1); } static entry tlexicon[DATRIE_WORD_MAX_COUNT]; /* TODO add datrie support */ dictionary_t dictionary = dictionary_get(t_dictionary); lexicon_count = dictionary_text_get_lexicon(dictionary, tlexicon); qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp); size_t i; size_t lexicon_cursor = 0; for (i = 0; i < lexicon_count; i ++) { lexicon[i].key = tlexicon[i].key; lexicon[i].length = ucs4len(lexicon[i].key); size_t j; for (j = 0; tlexicon[i].value[j] != NULL; j ++); lexicon[i].value_count = j; lexicon_index_length += lexicon[i].value_count + 1; lexicon[i].value = (value_t *) malloc(lexicon[i].value_count * sizeof(value_t)); for (j = 0; j < lexicon[i].value_count; j ++) { lexicon[i].value[j].cursor = lexicon_cursor; lexicon[i].value[j].pointer = tlexicon[i].value[j]; lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1; } } lexicon_cursor_end = lexicon_cursor; }
void output(const char * file_name) { FILE * fp = fopen(file_name, "wb"); if (!fp) { fprintf(stderr, _("Can not write file: %s\n"), file_name); exit(1); } uint32_t i, item_count; for (i = DATRIE_SIZE - 1; i > 0; i --) if (dat[i].parent != DATRIE_UNUSED) break; item_count = i + 1; fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp); /* 詞彙表長度 */ fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp); for (i = 0; i < lexicon_count; i ++) { size_t j; for (j = 0; j < lexicon[i].value_count; j ++) { fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t), ucs4len(lexicon[i].value[j].pointer) + 1, fp); } } /* 詞彙索引表長度 */ fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp); for (i = 0; i < lexicon_count; i ++) { size_t j; for (j = 0; j < lexicon[i].value_count; j ++) { fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp); } uint32_t dem = (uint32_t) -1; fwrite(&dem, sizeof(uint32_t), 1, fp); /* 分隔符 */ } fwrite(&lexicon_count, sizeof(uint32_t), 1, fp); fwrite(&item_count, sizeof(uint32_t), 1, fp); fwrite(dat, sizeof(dat[0]), item_count, fp); fclose(fp); }
void init(const char * filename) { dictionary_group_t dictionary_group = dictionary_group_open(); if (dictionary_group_load(dictionary_group, filename, OPENCC_DICTIONARY_TYPE_TEXT) == -1) { dictionary_perror("Dictionary loading error"); fprintf(stderr, _("\n")); exit(1); } dictionary_t t_dictionary = dictionary_group_get_dictionary(dictionary_group, 0); if (t_dictionary == (dictionary_t) -1) { dictionary_perror("Dictionary loading error"); fprintf(stderr, _("\n")); exit(1); } static opencc_entry tlexicon[DATRIE_WORD_MAX_COUNT]; /* TODO add datrie support */ dictionary_t dictionary = dictionary_get(t_dictionary); lexicon_count = dictionary_text_get_lexicon(dictionary, tlexicon); qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp); size_t i; lexicon[0].pos = 0; for (i = 0; i < lexicon_count; i ++) { lexicon[i].key = tlexicon[i].key; lexicon[i].value = tlexicon[i].value; lexicon[i].length = ucs4len(lexicon[i].key); if (i > 0) { lexicon[i].pos = lexicon[i-1].pos + ucs4len(lexicon[i-1].value) + 1; } } }
/*
 * Forward maximum matching segmentation and conversion.
 *
 * Repeatedly asks the current dictionary group for the longest match at
 * *inbuf. A matched run is replaced by the returned string; an unmatched
 * character is copied through unchanged. Both buffer pointers and both
 * remaining counts are advanced in place.
 *
 * Stops when the input is exhausted (count reaches zero or a zero character
 * is met) or the output buffer is full.
 *
 * Returns the number of input characters consumed. If the very first
 * replacement does not fit into the output buffer, sets errnum to
 * CONVERTER_ERROR_OUTBUF and returns (size_t) -1.
 */
static size_t segment(converter_desc * converter, ucs4_t ** inbuf, size_t * inbuf_left,
        ucs4_t ** outbuf, size_t * outbuf_left)
{
    size_t inbuf_left_start = *inbuf_left;

    /* Check the remaining counts BEFORE dereferencing *inbuf, so a counted
     * buffer without a terminator is never read past its end. */
    while (*inbuf_left > 0 && *outbuf_left > 0 && **inbuf)
    {
        size_t match_len;
        const ucs4_t * match_rs = dictionary_group_match_longest(
                converter->current_dictionary_group,
                *inbuf,
                *inbuf_left,
                &match_len
        );

        if (match_rs == NULL)
        {
            /* No dictionary entry here: copy one character through. */
            **outbuf = **inbuf;
            (*outbuf) ++, (*outbuf_left) --;
            (*inbuf) ++, (*inbuf_left) --;
        }
        else
        {
            /* The replacement is longer than the space left in the output. */
            if (ucs4len(match_rs) > *outbuf_left)
            {
                /* Something was already converted: report partial progress. */
                if (inbuf_left_start - *inbuf_left > 0)
                    break;
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t) -1;
            }

            for (; *match_rs; match_rs ++)
            {
                **outbuf = *match_rs;
                (*outbuf) ++, (*outbuf_left) --;
            }

            *inbuf += match_len;
            *inbuf_left -= match_len;
        }
    }

    return inbuf_left_start - *inbuf_left;
}
/*
 * Collects the lengths of all prefixes of `word` that are found in the text
 * dictionary's lexicon (looked up with bsearch and qsort_entry_cmp).
 *
 * Candidate prefixes run from min(ucs4len(word), max_length) down to 1, so
 * lengths are stored in match_length longest first. The dictionary's
 * word_buff is used as scratch space for the truncated prefix.
 *
 * Returns the number of lengths stored; 0 for an empty dictionary.
 */
size_t dict_text_get_all_match_lengths(Dict* dict, const ucs4_t* word,
                                       size_t* match_length) {
  TextDict* td = (TextDict*)dict;
  if (td->entry_count == 0) {
    return 0;
  }

  /* Never probe prefixes longer than the word or the longest key. */
  size_t word_length = ucs4len(word);
  size_t prefix_length =
      (word_length < td->max_length) ? word_length : td->max_length;

  ucs4ncpy(td->word_buff, word, prefix_length);
  td->word_buff[prefix_length] = L'\0';

  /* Search probe: only the key is consulted by this lookup. */
  TextEntry probe;
  probe.key = td->word_buff;

  size_t found = 0;
  while (prefix_length > 0) {
    /* Shorten the scratch string to the current candidate length. */
    td->word_buff[prefix_length] = L'\0';

    TextEntry* hit = (TextEntry*)bsearch(&probe,
                                         td->lexicon,
                                         td->entry_count,
                                         sizeof(td->lexicon[0]),
                                         qsort_entry_cmp);
    if (hit != NULL) {
      match_length[found++] = prefix_length;
    }

    prefix_length--;
  }

  return found;
}
/*
 * Converts a UTF-8 string with the given converter.
 *
 * inbuf   NUL-terminated UTF-8 input.
 * length  number of bytes of inbuf to convert; (size_t)-1 or any value
 *         larger than strlen(inbuf) means the whole string.
 *
 * Returns a newly malloc'ed, NUL-terminated UTF-8 string that the caller
 * must free. Returns (char*)-1 on failure:
 *   - input or output encoding failure: errnum = OPENCC_ERROR_ENCODING
 *   - opencc_convert failure: errnum is left as opencc_convert set it
 *   - allocation failure: errnum is left untouched (no dedicated error code
 *     is visible from this function)
 */
char* opencc_convert_utf8(opencc_t t_opencc, const char* inbuf, size_t length) {
  if (!lib_initialized) {
    lib_initialize();
  }

  size_t actual_length = strlen(inbuf);
  if ((length == (size_t)-1) || (length > actual_length)) {
    length = actual_length;
  }

  ucs4_t* winbuf = utf8_to_ucs4(inbuf, length);
  if (winbuf == (ucs4_t*)-1) {
    /* Can not convert input UTF8 to UCS4 */
    errnum = OPENCC_ERROR_ENCODING;
    return (char*)-1;
  }

  /* UTF-8 result buffer: `out_used` bytes are filled, the terminator always
   * fits because out_used < out_capacity is maintained throughout. */
  size_t out_capacity = length + 1;
  size_t out_used = 0;
  char* out = (char*)malloc(sizeof(char) * out_capacity);

  /* UCS-4 buffer receiving one converted chunk at a time. */
  size_t wbufsize = length + 64;
  ucs4_t* woutbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * (wbufsize + 1));

  if (out == NULL || woutbuf == NULL) {
    free(out);
    free(woutbuf);
    free(winbuf);
    return (char*)-1;
  }
  out[0] = '\0';

  ucs4_t* pinbuf = winbuf;
  ucs4_t* poutbuf = woutbuf;
  size_t inbuf_left = ucs4len(winbuf);
  size_t outbuf_left = wbufsize;

  while (inbuf_left > 0) {
    size_t retval = opencc_convert(t_opencc, &pinbuf, &inbuf_left,
                                   &poutbuf, &outbuf_left);
    if (retval == (size_t)-1) {
      /* Free the start of the result buffer, never a moved cursor. */
      free(out);
      free(winbuf);
      free(woutbuf);
      return (char*)-1;
    }

    *poutbuf = L'\0';

    char* ubuff = ucs4_to_utf8(woutbuf, (size_t)-1);
    if (ubuff == (char*)-1) {
      free(out);
      free(winbuf);
      free(woutbuf);
      errnum = OPENCC_ERROR_ENCODING;
      return (char*)-1;
    }

    size_t ubuff_len = strlen(ubuff);

    /* Grow when the chunk plus terminator does not fit in what is left;
     * at least double so repeated appends stay cheap. */
    size_t needed = out_used + ubuff_len + 1;
    if (needed > out_capacity) {
      size_t new_capacity = out_capacity * 2;
      if (new_capacity < needed) {
        new_capacity = needed;
      }
      char* grown = (char*)realloc(out, sizeof(char) * new_capacity);
      if (grown == NULL) {
        free(ubuff);
        free(out);
        free(winbuf);
        free(woutbuf);
        return (char*)-1;
      }
      out = grown;
      out_capacity = new_capacity;
    }

    memcpy(out + out_used, ubuff, ubuff_len);
    out_used += ubuff_len;
    out[out_used] = '\0';
    free(ubuff);

    /* Reuse the whole UCS-4 buffer for the next chunk. */
    outbuf_left = wbufsize;
    poutbuf = woutbuf;
  }

  free(winbuf);
  free(woutbuf);

  /* Trim to the exact size; if shrinking fails the larger block is still
   * a valid result. */
  char* trimmed = (char*)realloc(out, sizeof(char) * (out_used + 1));
  if (trimmed != NULL) {
    out = trimmed;
  }

  return out;
}
/*
 * Converts a UTF-8 string with the given converter.
 *
 * inbuf   NUL-terminated UTF-8 input.
 * length  number of bytes of inbuf to convert; (size_t) -1 or any value
 *         larger than strlen(inbuf) means the whole string.
 *
 * Returns a newly malloc'ed, NUL-terminated UTF-8 string that the caller
 * must free. Returns (char *) -1 on failure:
 *   - input or output encoding failure: errnum = OPENCC_ERROR_ENCODIND
 *   - opencc_convert failure: errnum is left as opencc_convert set it
 *   - allocation failure: errnum is left untouched (no dedicated error code
 *     is visible from this function)
 */
char * opencc_convert_utf8(opencc_t t_opencc, const char * inbuf, size_t length)
{
    if (!lib_initialized)
        lib_initialize();

    size_t actual_length = strlen(inbuf);
    if (length == (size_t) -1 || length > actual_length)
        length = actual_length;

    /* Convert the input data into a ucs4_t string */
    ucs4_t * winbuf = utf8_to_ucs4(inbuf, length);
    if (winbuf == (ucs4_t *) -1)
    {
        /* Input conversion failed */
        errnum = OPENCC_ERROR_ENCODIND;
        return (char *) -1;
    }

    /* UTF-8 result buffer: `out_used` bytes are filled, the terminator
     * always fits because out_used < out_capacity is maintained. */
    size_t out_capacity = length + 1;
    size_t out_used = 0;
    char * out = (char *) malloc(sizeof(char) * out_capacity);

    /* UCS-4 buffer receiving one converted chunk at a time */
    size_t wbufsize = length + 64;
    ucs4_t * woutbuf = (ucs4_t *) malloc(sizeof(ucs4_t) * (wbufsize + 1));

    if (out == NULL || woutbuf == NULL)
    {
        free(out);
        free(woutbuf);
        free(winbuf);
        return (char *) -1;
    }
    out[0] = '\0';

    ucs4_t * pinbuf = winbuf;
    ucs4_t * poutbuf = woutbuf;
    size_t inbuf_left = ucs4len(winbuf);
    size_t outbuf_left = wbufsize;

    while (inbuf_left > 0)
    {
        size_t retval = opencc_convert(t_opencc, &pinbuf, &inbuf_left, &poutbuf, &outbuf_left);
        if (retval == (size_t) -1)
        {
            /* Free the start of the result buffer, never a moved cursor. */
            free(out);
            free(winbuf);
            free(woutbuf);
            return (char *) -1;
        }

        *poutbuf = L'\0';

        char * ubuff = ucs4_to_utf8(woutbuf, (size_t) -1);
        if (ubuff == (char *) -1)
        {
            free(out);
            free(winbuf);
            free(woutbuf);
            errnum = OPENCC_ERROR_ENCODIND;
            return (char *) -1;
        }

        size_t ubuff_len = strlen(ubuff);

        /* Grow when the chunk plus terminator does not fit in what is left;
         * at least double so repeated appends stay cheap. */
        size_t needed = out_used + ubuff_len + 1;
        if (needed > out_capacity)
        {
            size_t new_capacity = out_capacity * 2;
            if (new_capacity < needed)
                new_capacity = needed;

            char * grown = (char *) realloc(out, sizeof(char) * new_capacity);
            if (grown == NULL)
            {
                free(ubuff);
                free(out);
                free(winbuf);
                free(woutbuf);
                return (char *) -1;
            }
            out = grown;
            out_capacity = new_capacity;
        }

        memcpy(out + out_used, ubuff, ubuff_len);
        out_used += ubuff_len;
        out[out_used] = '\0';
        free(ubuff);

        /* Reuse the whole UCS-4 buffer for the next chunk */
        outbuf_left = wbufsize;
        poutbuf = woutbuf;
    }

    free(winbuf);
    free(woutbuf);

    /* Trim to the exact size; if shrinking fails the larger block is still
     * a valid result. */
    char * trimmed = (char *) realloc(out, sizeof(char) * (out_used + 1));
    if (trimmed != NULL)
        out = trimmed;

    return out;
}
Dict* dict_text_new(const char* filename) { TextDict* text_dictionary; text_dictionary = (TextDict*)malloc(sizeof(TextDict)); text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE; text_dictionary->max_length = 0; text_dictionary->lexicon = (TextEntry*)malloc( sizeof(TextEntry) * text_dictionary->entry_count); text_dictionary->word_buff = NULL; static char buff[ENTRY_BUFF_SIZE]; FILE* fp = fopen(filename, "r"); if (fp == NULL) { dict_text_delete((Dict*)text_dictionary); return (Dict*)-1; } skip_utf8_bom(fp); size_t i = 0; while (fgets(buff, ENTRY_BUFF_SIZE, fp)) { if (i >= text_dictionary->entry_count) { text_dictionary->entry_count += text_dictionary->entry_count; text_dictionary->lexicon = (TextEntry*)realloc( text_dictionary->lexicon, sizeof(TextEntry) * text_dictionary->entry_count ); } if (parse_entry(buff, text_dictionary->lexicon + i) == -1) { text_dictionary->entry_count = i; dict_text_delete((Dict*)text_dictionary); return (Dict*)-1; } size_t length = ucs4len(text_dictionary->lexicon[i].key); if (length > text_dictionary->max_length) { text_dictionary->max_length = length; } i++; } fclose(fp); text_dictionary->entry_count = i; text_dictionary->lexicon = (TextEntry*)realloc( text_dictionary->lexicon, sizeof(TextEntry) * text_dictionary->entry_count ); text_dictionary->word_buff = (ucs4_t*) malloc(sizeof(ucs4_t) * (text_dictionary->max_length + 1)); qsort(text_dictionary->lexicon, text_dictionary->entry_count, sizeof(text_dictionary->lexicon[0]), qsort_entry_cmp ); return (Dict*)text_dictionary; }
/*
 * Shortest-path segmentation and conversion of the next `length` input
 * characters.
 *
 * A dynamic program over positions 0..length finds a split of the input into
 * the fewest pieces, where a piece is either a single character or a run
 * whose length was reported by dictionary_group_get_all_match_lengths. The
 * pieces are then converted one by one with dictionary_group_match_longest;
 * an unmatched piece is copied through one character at a time.
 *
 * The buffer pointers and remaining counts are advanced in place.
 *
 * Returns the number of input characters consumed (always 1 when
 * length == 1). If nothing could be written because the output buffer is
 * too small, sets errnum to CONVERTER_ERROR_OUTBUF and returns (size_t) -1.
 */
static size_t sp_seg(converter_desc * converter, ucs4_t ** inbuf, size_t * inbuf_left,
        ucs4_t ** outbuf, size_t * outbuf_left, size_t length)
{
    /* Special-cased fast path for a single character */
    if (length == 1)
    {
        const ucs4_t * match_rs = dictionary_group_match_longest(
                converter->current_dictionary_group,
                *inbuf,
                1,
                NULL
        );

        if (match_rs == NULL)
        {
            **outbuf = **inbuf;
            (*outbuf) ++, (*outbuf_left) --;
            (*inbuf) ++, (*inbuf_left) --;
        }
        else
        {
            if (ucs4len(match_rs) > *outbuf_left)
            {
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t) -1;
            }

            for (; *match_rs; match_rs ++)
            {
                **outbuf = *match_rs;
                (*outbuf) ++, (*outbuf_left) --;
            }

            (*inbuf) ++;
            (*inbuf_left) --;
        }

        /* NOTE(review): the original comment here says room for one
         * character must be guaranteed - presumably a precondition on the
         * output buffer for the copy-through branch above; confirm against
         * the caller. */
        return 1;
    }

    /* Make sure the work buffers can hold length + 1 positions */
    spseg_buffer_desc * ossb = &(converter->spseg_buffer);
    size_t buffer_size_need = length + 1;
    if (ossb->initialized == FALSE || ossb->buffer_size < buffer_size_need)
        sp_seg_set_buffer_size(&(converter->spseg_buffer), buffer_size_need);

    size_t i, j;

    for (i = 0; i <= length; i ++)
        ossb->min_len[i] = INFINITY_INT;

    ossb->min_len[0] = ossb->parent[0] = 0;

    for (i = 0; i < length; i ++)
    {
        /* Get every match length starting at position i */
        size_t match_count = dictionary_group_get_all_match_lengths(
                converter->current_dictionary_group,
                (*inbuf) + i,
                ossb->match_length
        );

        /* A single-character step is always a candidate */
        if (ossb->match_length[0] != 1)
            ossb->match_length[match_count ++] = 1;

        /* Dynamic programming for the shortest split */
        for (j = 0; j < match_count; j ++)
        {
            size_t k = ossb->match_length[j];
            /* Reset the slot as soon as it has been read */
            ossb->match_length[j] = 0;

            /* A match running past the analysed window would index
             * min_len/parent beyond position `length`; such a step can never
             * be part of the path traced back from `length`, so skip it. */
            if (k > length - i)
                continue;

            /* Multi-character pieces win ties (<=); single characters only
             * replace a strictly worse path (<). */
            if (k > 1 && ossb->min_len[i] + 1 <= ossb->min_len[i + k])
            {
                ossb->min_len[i + k] = ossb->min_len[i] + 1;
                ossb->parent[i + k] = i;
            }
            else if (k == 1 && ossb->min_len[i] + 1 < ossb->min_len[i + k])
            {
                ossb->min_len[i + k] = ossb->min_len[i] + 1;
                ossb->parent[i + k] = i;
            }
        }
    }

    /* Walk the parent links back from `length` to recover the piece ends */
    for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i])
        ossb->path[--j] = i;

    size_t inbuf_left_start = *inbuf_left;
    size_t begin, end;

    /* Convert piece by piece along the shortest path */
    for (i = begin = 0; i < ossb->min_len[length]; i ++)
    {
        end = ossb->path[i];

        size_t match_len;
        const ucs4_t * match_rs = dictionary_group_match_longest(
                converter->current_dictionary_group,
                *inbuf,
                end - begin,
                &match_len
        );

        if (match_rs == NULL)
        {
            /* Copying a character through needs one free output cell; treat
             * a full buffer exactly like a replacement that does not fit. */
            if (*outbuf_left == 0)
            {
                if (inbuf_left_start - *inbuf_left > 0)
                    break;
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t) -1;
            }

            **outbuf = **inbuf;
            (*outbuf) ++, (*outbuf_left) --;
            (*inbuf) ++, (*inbuf_left) --;
        }
        else
        {
            /* The replacement is longer than the space left in the output */
            if (ucs4len(match_rs) > *outbuf_left)
            {
                /* Something was already converted: report partial progress */
                if (inbuf_left_start - *inbuf_left > 0)
                    break;
                errnum = CONVERTER_ERROR_OUTBUF;
                return (size_t) -1;
            }

            for (; *match_rs; match_rs ++)
            {
                **outbuf = *match_rs;
                (*outbuf) ++, (*outbuf_left) --;
            }

            *inbuf += match_len;
            *inbuf_left -= match_len;
        }

        begin = end;
    }

    return inbuf_left_start - *inbuf_left;
}