コード例 #1
0
ファイル: opencc_dict.c プロジェクト: tornadory/OpenCC
/*
 * Write the compiled double-array trie dictionary to `file_name`.
 *
 * Data is written in this order:
 *   1. the magic string "OPENCCDATRIE" (without a terminator),
 *   2. lexicon_length as a size_t,
 *   3. item_count as a size_t,
 *   4. every lexicon value, ucs4len(value) + 1 elements each,
 *   5. the first item_count elements of dat.
 *
 * Prints a message and exits with status 1 when the file cannot be opened
 * or when any write (including the final flush in fclose) fails.
 *
 * NOTE(review): the counts are written as native size_t, so the file layout
 * depends on the platform that produced it.
 */
void output(const char * file_name)
{
	FILE * fp = fopen(file_name, "wb");

	if (!fp)
	{
		fprintf(stderr, _("Can not write file: %s\n"), file_name);
		exit(1);
	}

	size_t i, item_count;

	/* Find the highest used trie slot; everything above it is not stored. */
	for (i = DATRIE_SIZE - 1; i > 0; i --)
		if (dat[i].parent != DATRIE_UNUSED)
			break;
	item_count = i + 1;

	/*
	 * Total length of the value area. With an empty lexicon there is no
	 * last entry to look at, so the length is simply zero (the original
	 * indexed lexicon[lexicon_count - 1] unconditionally).
	 */
	size_t lexicon_length = 0;
	if (lexicon_count > 0)
	{
		lexicon_length = lexicon[lexicon_count - 1].pos +
				ucs4len(lexicon[lexicon_count - 1].value) + 1;
	}

	fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp);
	fwrite(&lexicon_length, sizeof(size_t), 1, fp);
	fwrite(&item_count, sizeof(size_t), 1, fp);

	for (i = 0; i < lexicon_count; i ++)
	{
		fwrite(lexicon[i].value, sizeof(ucs4_t), ucs4len(lexicon[i].value) + 1, fp);
	}

	fwrite(dat, sizeof(dat[0]), item_count, fp);

	/*
	 * A failed fwrite sets the stream's error indicator, so one check
	 * here covers all writes above; fclose reports flush failures.
	 */
	int write_failed = ferror(fp);
	if (fclose(fp) != 0)
		write_failed = 1;

	if (write_failed)
	{
		fprintf(stderr, _("Can not write file: %s\n"), file_name);
		exit(1);
	}
}
コード例 #2
0
ファイル: opencc_dict.c プロジェクト: crazyangelo/OpenCC
/*
 * Load the text dictionary `filename` and fill the global lexicon table.
 *
 * The entries returned by dictionary_text_get_lexicon are sorted with cmp.
 * For each entry the key pointer and key length are recorded, and a value
 * array is allocated in which every value gets its source pointer and a
 * running cursor (the sum of ucs4len(value) + 1 over all preceding values).
 *
 * Updates the globals lexicon, lexicon_count, lexicon_index_length and
 * lexicon_cursor_end. Prints a message and exits with status 1 when the
 * dictionary cannot be loaded or memory runs out.
 *
 * NOTE(review): lexicon_index_length is only added to, never reset here;
 * presumably it starts at zero and init is called once -- confirm.
 */
void init(const char * filename)
{
	dictionary_group_t dictionary_group = dictionary_group_open(NULL);

	if (dictionary_group_load(dictionary_group, filename, OPENCC_DICTIONARY_TYPE_TEXT) == -1)
	{
		dictionary_perror("Dictionary loading error");
		fprintf(stderr, _("\n"));
		exit(1);
	}

	dictionary_t t_dictionary = dictionary_group_get_dictionary(dictionary_group, 0);
	if (t_dictionary == (dictionary_t) -1)
	{
		dictionary_perror("Dictionary loading error");
		fprintf(stderr, _("\n"));
		exit(1);
	}

	/* static: the table is sized by DATRIE_WORD_MAX_COUNT, keep it off the stack */
	static entry tlexicon[DATRIE_WORD_MAX_COUNT];

	/* TODO add datrie support */
	dictionary_t dictionary = dictionary_get(t_dictionary);
	lexicon_count = dictionary_text_get_lexicon(dictionary, tlexicon);

	qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp);

	size_t i;
	size_t lexicon_cursor = 0;
	for (i = 0; i < lexicon_count; i ++)
	{
		lexicon[i].key = tlexicon[i].key;
		lexicon[i].length = ucs4len(lexicon[i].key);

		/* The value list of an entry is NULL-terminated; count it. */
		size_t j;
		for (j = 0; tlexicon[i].value[j] != NULL; j ++);
		lexicon[i].value_count = j;
		/* One index slot per value plus one more per entry. */
		lexicon_index_length += lexicon[i].value_count + 1;

		lexicon[i].value = (value_t *) malloc(lexicon[i].value_count * sizeof(value_t));
		/* malloc(0) may legitimately return NULL, so only a non-empty request can fail. */
		if (lexicon[i].value == NULL && lexicon[i].value_count > 0)
		{
			fprintf(stderr, _("Out of memory\n"));
			exit(1);
		}

		for (j = 0; j < lexicon[i].value_count; j ++)
		{
			lexicon[i].value[j].cursor = lexicon_cursor;
			lexicon[i].value[j].pointer = tlexicon[i].value[j];
			lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1;
		}
	}

	lexicon_cursor_end = lexicon_cursor;
}
コード例 #3
0
ファイル: opencc_dict.c プロジェクト: crazyangelo/OpenCC
/*
 * Write the compiled double-array trie dictionary to `file_name`.
 *
 * Data is written in this order:
 *   1. the magic string "OPENCCDATRIE" (without a terminator),
 *   2. lexicon_cursor_end, then every value string of every entry
 *      (ucs4len(value) + 1 elements each),
 *   3. lexicon_index_length, then for every entry the cursor of each of
 *      its values followed by a (uint32_t) -1 separator,
 *   4. lexicon_count,
 *   5. item_count and the first item_count elements of dat.
 *
 * Prints a message and exits with status 1 when the file cannot be opened
 * or when any write (including the final flush in fclose) fails.
 *
 * NOTE(review): the scalar fields are written with sizeof(uint32_t) bytes
 * taken from the address of each variable; this presumes those globals and
 * the cursor member are 32 bits wide -- confirm against their declarations.
 */
void output(const char * file_name)
{
	FILE * fp = fopen(file_name, "wb");

	if (!fp)
	{
		fprintf(stderr, _("Can not write file: %s\n"), file_name);
		exit(1);
	}

	uint32_t i, item_count;

	/* Find the highest used trie slot; everything above it is not stored. */
	for (i = DATRIE_SIZE - 1; i > 0; i --)
		if (dat[i].parent != DATRIE_UNUSED)
			break;
	item_count = i + 1;

	fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp);

	/* Lexicon (value table) length, followed by the values themselves */
	fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp);
	for (i = 0; i < lexicon_count; i ++)
	{
		size_t j;
		for (j = 0; j < lexicon[i].value_count; j ++)
		{
			fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t),
					ucs4len(lexicon[i].value[j].pointer) + 1, fp);
		}

	}

	/* Lexicon index table length, followed by the index table */
	fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp);
	for (i = 0; i < lexicon_count; i ++)
	{
		size_t j;
		for (j = 0; j < lexicon[i].value_count; j ++)
		{
			fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp);
		}
		uint32_t dem = (uint32_t) -1;
		fwrite(&dem, sizeof(uint32_t), 1, fp); /* separator between entries */
	}

	fwrite(&lexicon_count, sizeof(uint32_t), 1, fp);

	fwrite(&item_count, sizeof(uint32_t), 1, fp);

	fwrite(dat, sizeof(dat[0]), item_count, fp);

	/*
	 * A failed fwrite sets the stream's error indicator, so one check
	 * here covers all writes above; fclose reports flush failures.
	 */
	int write_failed = ferror(fp);
	if (fclose(fp) != 0)
		write_failed = 1;

	if (write_failed)
	{
		fprintf(stderr, _("Can not write file: %s\n"), file_name);
		exit(1);
	}
}
コード例 #4
0
ファイル: opencc_dict.c プロジェクト: tornadory/OpenCC
void init(const char * filename)
{
	dictionary_group_t dictionary_group = dictionary_group_open();

	if (dictionary_group_load(dictionary_group, filename, OPENCC_DICTIONARY_TYPE_TEXT) == -1)
	{
		dictionary_perror("Dictionary loading error");
		fprintf(stderr, _("\n"));
		exit(1);
	}

	dictionary_t t_dictionary = dictionary_group_get_dictionary(dictionary_group, 0);
	if (t_dictionary == (dictionary_t) -1)
	{
		dictionary_perror("Dictionary loading error");
		fprintf(stderr, _("\n"));
		exit(1);
	}

	static opencc_entry tlexicon[DATRIE_WORD_MAX_COUNT];

	/* TODO add datrie support */
	dictionary_t dictionary = dictionary_get(t_dictionary);
	lexicon_count = dictionary_text_get_lexicon(dictionary, tlexicon);

	qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp);

	size_t i;
	lexicon[0].pos = 0;
	for (i = 0; i < lexicon_count; i ++)
	{
		lexicon[i].key = tlexicon[i].key;
		lexicon[i].value = tlexicon[i].value;
		lexicon[i].length = ucs4len(lexicon[i].key);
		if (i > 0)
		{
			lexicon[i].pos = lexicon[i-1].pos + ucs4len(lexicon[i-1].value) + 1;
		}
	}
}
コード例 #5
0
ファイル: converter.c プロジェクト: tornadory/OpenCC
/*
 * Forward maximum matching segmentation and conversion.
 *
 * Repeatedly asks the current dictionary group for the longest match at
 * *inbuf. Without a match a single character is copied through unchanged;
 * with a match the replacement string is written to *outbuf and the input
 * advances by the matched length. All four cursor/counter arguments are
 * updated in place.
 *
 * Returns the number of input characters consumed. If the very first
 * replacement does not fit into the output buffer, sets errnum to
 * CONVERTER_ERROR_OUTBUF and returns (size_t) -1; if it happens later, the
 * loop stops and the partial count is returned.
 */
static size_t segment(converter_desc * converter,
		ucs4_t ** inbuf, size_t * inbuf_left,
		ucs4_t ** outbuf, size_t * outbuf_left)
{
	size_t inbuf_left_start = *inbuf_left;

	/*
	 * Check the remaining counts before looking at **inbuf: once
	 * *inbuf_left reaches zero, *inbuf may point one past the end of a
	 * buffer that is delimited by length only, and must not be read.
	 */
	while (*inbuf_left > 0 && *outbuf_left > 0 && **inbuf)
	{
		size_t match_len;
		const ucs4_t * match_rs = dictionary_group_match_longest(
				converter->current_dictionary_group,
				*inbuf,
				*inbuf_left,
				&match_len
		);

		if (match_rs == NULL)
		{
			/* No dictionary entry: pass the character through. */
			**outbuf = **inbuf;
			(*outbuf) ++, (*outbuf_left) --;
			(*inbuf) ++, (*inbuf_left) --;
		}
		else
		{
			/* Remaining output space is smaller than the replacement */
			if (ucs4len(match_rs) > *outbuf_left)
			{
				/* Something was already converted: report partial progress. */
				if (inbuf_left_start - *inbuf_left > 0)
					break;
				errnum = CONVERTER_ERROR_OUTBUF;
				return (size_t) -1;
			}

			for (; *match_rs; match_rs ++)
			{
				**outbuf = *match_rs;
				(*outbuf) ++,(*outbuf_left) --;
			}

			*inbuf += match_len;
			*inbuf_left -= match_len;
		}
	}

	return inbuf_left_start - *inbuf_left;
}
コード例 #6
0
ファイル: text.c プロジェクト: Axure/librime
/*
 * Collect the lengths of all prefixes of `word` that are keys of the text
 * dictionary `dict`.
 *
 * Prefixes are tried from the longest candidate (the shorter of the word
 * length and the dictionary's max_length) down to length 1, so the lengths
 * stored in `match_length` are in descending order. The dictionary's own
 * word_buff is used as scratch space for the prefix.
 *
 * Returns the number of lengths stored in `match_length`.
 */
size_t dict_text_get_all_match_lengths(Dict* dict,
                                       const ucs4_t* word,
                                       size_t* match_length) {
  TextDict* td = (TextDict*)dict;

  if (td->entry_count == 0) {
    return 0;
  }

  /* Longest prefix worth trying: min(word length, longest key). */
  size_t word_len = ucs4len(word);
  size_t prefix_len = (word_len < td->max_length) ? word_len : td->max_length;

  ucs4ncpy(td->word_buff, word, prefix_len);
  td->word_buff[prefix_len] = L'\0';

  /* Search probe; only the key member is set here. */
  TextEntry probe;
  probe.key = td->word_buff;

  size_t found = 0;

  while (prefix_len > 0) {
    /* Cut the scratch copy down to the current prefix length. */
    td->word_buff[prefix_len] = L'\0';

    void* hit = bsearch(&probe,
                        td->lexicon,
                        td->entry_count,
                        sizeof(td->lexicon[0]),
                        qsort_entry_cmp);

    if (hit != NULL) {
      match_length[found] = prefix_len;
      found++;
    }

    prefix_len--;
  }

  return found;
}
コード例 #7
0
ファイル: opencc.c プロジェクト: johnnywjy/OpenCC
/*
 * Convert a UTF-8 string with the converter `t_opencc`.
 *
 * `length` is the number of bytes of `inbuf` to convert; (size_t)-1 or any
 * value larger than strlen(inbuf) means "the whole string".
 *
 * Returns a newly allocated, NUL-terminated UTF-8 string that the caller
 * releases with free(), or (char*)-1 on failure. errnum is set to
 * OPENCC_ERROR_ENCODING when a UTF-8 <-> UCS-4 conversion fails; on an
 * allocation failure errnum is left unchanged because no dedicated error
 * code is visible in this block.
 */
char* opencc_convert_utf8(opencc_t t_opencc, const char* inbuf, size_t length) {
  if (!lib_initialized) {
    lib_initialize();
  }
  size_t actual_length = strlen(inbuf);
  if ((length == (size_t)-1) || (length > actual_length)) {
    length = actual_length;
  }
  ucs4_t* winbuf = utf8_to_ucs4(inbuf, length);
  if (winbuf == (ucs4_t*)-1) {
    /* Can not convert input UTF8 to UCS4 */
    errnum = OPENCC_ERROR_ENCODING;
    return (char*)-1;
  }
  /*
   * UTF-8 result buffer. out_capacity counts payload bytes only; the
   * allocation always has one extra byte for the terminator.
   */
  size_t out_capacity = length;
  size_t out_used = 0;
  char* result = (char*)malloc(sizeof(char) * (out_capacity + 1));
  /* UCS-4 scratch buffer that receives one converted chunk at a time */
  size_t wbufsize = length + 64;
  ucs4_t* woutbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * (wbufsize + 1));
  if (result == NULL || woutbuf == NULL) {
    free(result);
    free(winbuf);
    free(woutbuf);
    return (char*)-1;
  }
  result[0] = '\0';
  ucs4_t* pinbuf = winbuf;
  ucs4_t* poutbuf = woutbuf;
  size_t inbuf_left = ucs4len(winbuf);
  size_t outbuf_left = wbufsize;
  while (inbuf_left > 0) {
    size_t retval = opencc_convert(t_opencc,
                                   &pinbuf,
                                   &inbuf_left,
                                   &poutbuf,
                                   &outbuf_left);
    if (retval == (size_t)-1) {
      /* Free the start of the allocation, never a cursor into it. */
      free(result);
      free(winbuf);
      free(woutbuf);
      return (char*)-1;
    }
    *poutbuf = L'\0';
    char* ubuff = ucs4_to_utf8(woutbuf, (size_t)-1);
    if (ubuff == (char*)-1) {
      free(result);
      free(winbuf);
      free(woutbuf);
      errnum = OPENCC_ERROR_ENCODING;
      return (char*)-1;
    }
    size_t ubuff_len = strlen(ubuff);
    /*
     * Grow against the space that is actually left, not against the
     * total capacity, so that several chunks cannot overrun the buffer.
     */
    if (ubuff_len > out_capacity - out_used) {
      size_t needed = out_used + ubuff_len;
      size_t new_capacity = out_capacity * 2;
      if (new_capacity < needed) {
        new_capacity = needed;
      }
      char* grown = (char*)realloc(result, sizeof(char) * (new_capacity + 1));
      if (grown == NULL) {
        free(ubuff);
        free(result);
        free(winbuf);
        free(woutbuf);
        return (char*)-1;
      }
      result = grown;
      out_capacity = new_capacity;
    }
    memcpy(result + out_used, ubuff, ubuff_len);
    free(ubuff);
    out_used += ubuff_len;
    result[out_used] = '\0';
    /* Reuse the whole scratch buffer for the next chunk. */
    outbuf_left = wbufsize;
    poutbuf = woutbuf;
  }
  free(winbuf);
  free(woutbuf);
  /* Trim to the exact size; if that fails the larger buffer is still valid. */
  char* trimmed = (char*)realloc(result, sizeof(char) * (out_used + 1));
  if (trimmed != NULL) {
    result = trimmed;
  }
  return result;
}
コード例 #8
0
ファイル: opencc.c プロジェクト: izenecloud/icma
/*
 * Convert a UTF-8 string with the converter `t_opencc`.
 *
 * `length` is the number of bytes of `inbuf` to convert; (size_t) -1 or any
 * value larger than strlen(inbuf) means "the whole string".
 *
 * Returns a newly allocated, NUL-terminated UTF-8 string that the caller
 * releases with free(), or (char *) -1 on failure. errnum is set to
 * OPENCC_ERROR_ENCODIND when a UTF-8 <-> UCS-4 conversion fails; on an
 * allocation failure errnum is left unchanged because no dedicated error
 * code is visible in this block.
 */
char * opencc_convert_utf8(opencc_t t_opencc, const char * inbuf, size_t length)
{
    if (!lib_initialized)
        lib_initialize();

    size_t actual_length = strlen(inbuf);
    if (length == (size_t) -1 || length > actual_length)
        length = actual_length;

    /* Convert the input data to a ucs4_t string */
    ucs4_t * winbuf = utf8_to_ucs4(inbuf, length);
    if (winbuf == (ucs4_t *) -1)
    {
        /* Input conversion failed */
        errnum = OPENCC_ERROR_ENCODIND;
        return (char *) -1;
    }

    /*
     * UTF-8 result buffer. out_capacity counts payload bytes only; the
     * allocation always has one extra byte for the terminator.
     */
    size_t out_capacity = length;
    size_t out_used = 0;
    char * result = (char *) malloc(sizeof(char) * (out_capacity + 1));

    /* UCS-4 scratch buffer that receives one converted chunk at a time */
    size_t wbufsize = length + 64;
    ucs4_t * woutbuf = (ucs4_t *) malloc(sizeof(ucs4_t) * (wbufsize + 1));

    if (result == NULL || woutbuf == NULL)
    {
        free(result);
        free(winbuf);
        free(woutbuf);
        return (char *) -1;
    }
    result[0] = '\0';

    ucs4_t * pinbuf = winbuf;
    ucs4_t * poutbuf = woutbuf;
    size_t inbuf_left = ucs4len(winbuf);
    size_t outbuf_left = wbufsize;

    while (inbuf_left > 0)
    {
        size_t retval = opencc_convert(t_opencc, &pinbuf, &inbuf_left, &poutbuf, &outbuf_left);
        if (retval == (size_t) -1)
        {
            /* Free the start of the allocation, never a cursor into it. */
            free(result);
            free(winbuf);
            free(woutbuf);
            return (char *) -1;
        }

        *poutbuf = L'\0';

        char * ubuff = ucs4_to_utf8(woutbuf, (size_t) -1);

        if (ubuff == (char *) -1)
        {
            free(result);
            free(winbuf);
            free(woutbuf);
            errnum = OPENCC_ERROR_ENCODIND;
            return (char *) -1;
        }

        size_t ubuff_len = strlen(ubuff);

        /*
         * Grow against the space that is actually left, not against the
         * total capacity, so that several chunks cannot overrun the buffer.
         */
        if (ubuff_len > out_capacity - out_used)
        {
            size_t needed = out_used + ubuff_len;
            size_t new_capacity = out_capacity * 2;
            if (new_capacity < needed)
                new_capacity = needed;

            char * grown = (char *) realloc(result, sizeof(char) * (new_capacity + 1));
            if (grown == NULL)
            {
                free(ubuff);
                free(result);
                free(winbuf);
                free(woutbuf);
                return (char *) -1;
            }
            result = grown;
            out_capacity = new_capacity;
        }

        memcpy(result + out_used, ubuff, ubuff_len);
        free(ubuff);

        out_used += ubuff_len;
        result[out_used] = '\0';

        /* Reuse the whole scratch buffer for the next chunk. */
        outbuf_left = wbufsize;
        poutbuf = woutbuf;
    }

    free(winbuf);
    free(woutbuf);

    /* Trim to the exact size; if that fails the larger buffer is still valid. */
    char * trimmed = (char *) realloc(result, sizeof(char) * (out_used + 1));
    if (trimmed != NULL)
        result = trimmed;

    return result;
}
コード例 #9
0
ファイル: text.c プロジェクト: Axure/librime
Dict* dict_text_new(const char* filename) {
  TextDict* text_dictionary;

  text_dictionary = (TextDict*)malloc(sizeof(TextDict));
  text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
  text_dictionary->max_length = 0;
  text_dictionary->lexicon = (TextEntry*)malloc(
    sizeof(TextEntry) * text_dictionary->entry_count);
  text_dictionary->word_buff = NULL;

  static char buff[ENTRY_BUFF_SIZE];

  FILE* fp = fopen(filename, "r");

  if (fp == NULL) {
    dict_text_delete((Dict*)text_dictionary);
    return (Dict*)-1;
  }
  skip_utf8_bom(fp);

  size_t i = 0;

  while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
    if (i >= text_dictionary->entry_count) {
      text_dictionary->entry_count += text_dictionary->entry_count;
      text_dictionary->lexicon = (TextEntry*)realloc(
        text_dictionary->lexicon,
        sizeof(TextEntry) * text_dictionary->entry_count
        );
    }

    if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
      text_dictionary->entry_count = i;
      dict_text_delete((Dict*)text_dictionary);
      return (Dict*)-1;
    }

    size_t length = ucs4len(text_dictionary->lexicon[i].key);

    if (length > text_dictionary->max_length) {
      text_dictionary->max_length = length;
    }

    i++;
  }

  fclose(fp);

  text_dictionary->entry_count = i;
  text_dictionary->lexicon = (TextEntry*)realloc(
    text_dictionary->lexicon,
    sizeof(TextEntry) * text_dictionary->entry_count
    );
  text_dictionary->word_buff = (ucs4_t*)
                               malloc(sizeof(ucs4_t) *
                                      (text_dictionary->max_length + 1));

  qsort(text_dictionary->lexicon,
        text_dictionary->entry_count,
        sizeof(text_dictionary->lexicon[0]),
        qsort_entry_cmp
        );

  return (Dict*)text_dictionary;
}
コード例 #10
0
ファイル: converter.c プロジェクト: tornadory/OpenCC
/*
 * Shortest-path segmentation and conversion of the next `length` input
 * characters.
 *
 * For length == 1 the single character is looked up directly and either
 * copied through or replaced. Otherwise a dynamic-programming pass over
 * positions 0..length computes, for every position, the minimum number of
 * segments needed to reach it (min_len) and the position it was reached
 * from (parent); the resulting path is then walked and each segment is
 * looked up and written to *outbuf.
 *
 * The cursor/counter arguments are updated in place. Returns the number of
 * input characters consumed (always 1 in the length == 1 case), or
 * (size_t) -1 with errnum set to CONVERTER_ERROR_OUTBUF when a replacement
 * does not fit into the output buffer before anything was converted.
 */
static size_t sp_seg(converter_desc * converter, ucs4_t ** inbuf, size_t * inbuf_left,
		ucs4_t ** outbuf, size_t * outbuf_left, size_t length)
{
	/* Shortest-path segmentation */
	
	/* Special-case optimisation for length 1 */
	if (length == 1)
	{
		const ucs4_t * match_rs = dictionary_group_match_longest(
				converter->current_dictionary_group,
				*inbuf,
				1,
				NULL
		);
		
		if (match_rs == NULL)
		{
			/*
			 * No dictionary entry: pass the character through.
			 * NOTE(review): *outbuf_left is not checked on this path;
			 * presumably the caller guarantees at least one free slot
			 * (see the comment before the return below) -- confirm.
			 */
			**outbuf = **inbuf;
			(*outbuf) ++,(*outbuf_left) --;
			(*inbuf) ++,(*inbuf_left) --;
		}
		else
		{
			if (ucs4len(match_rs) > *outbuf_left)
			{
				errnum = CONVERTER_ERROR_OUTBUF;
				return (size_t) -1;
			}
			for (; *match_rs; match_rs ++)
			{
				**outbuf = *match_rs;
				(*outbuf) ++,(*outbuf_left) --;
			}
			(*inbuf) ++;	(*inbuf_left) --;
		}

		/* There must be guaranteed space for one character */
		return 1;
	}
	
	/* Set up the work buffers (resized when missing or too small) */
	spseg_buffer_desc * ossb = &(converter->spseg_buffer);
	size_t buffer_size_need = length + 1;
	if (ossb->initialized == FALSE || ossb->buffer_size < buffer_size_need)
		sp_seg_set_buffer_size(&(converter->spseg_buffer), buffer_size_need);
	
	size_t i, j;

	/* Every position starts out as unreachable... */
	for (i = 0; i <= length; i ++)
		ossb->min_len[i] = INFINITY_INT;
	
	/* ...except the start, which needs zero segments. */
	ossb->min_len[0] = ossb->parent[0] = 0;
	
	for (i = 0; i < length; i ++)
	{
		/* Get all match lengths for the text starting at position i */
		size_t match_count = dictionary_group_get_all_match_lengths(
				converter->current_dictionary_group,
				(*inbuf) + i,
				ossb->match_length
		);
		
		/*
		 * Make sure a single-character step is always among the candidates.
		 * NOTE(review): match_length[0] is read even when match_count is 0;
		 * every used slot is reset to 0 in the loop below, so presumably
		 * slot 0 is 0 in that case -- confirm the buffer's initial state.
		 */
		if (ossb->match_length[0] != 1)
			ossb->match_length[match_count ++] = 1;
		
		/* Dynamic programming: find the shortest split path */
		for (j = 0; j < match_count; j ++)
		{
			size_t k = ossb->match_length[j];
			ossb->match_length[j] = 0;
			
			/*
			 * A multi-character match wins ties (<=), a single-character
			 * step only replaces a strictly longer path (<).
			 * NOTE(review): i + k is not checked against length here;
			 * presumably the caller chooses length so that no match
			 * extends past it -- confirm.
			 */
			if (k > 1 && ossb->min_len[i] + 1 <= ossb->min_len[i + k])
			{
				ossb->min_len[i + k] = ossb->min_len[i] + 1;
				ossb->parent[i + k] = i;
			}
			else if (k == 1 && ossb->min_len[i] + 1 < ossb->min_len[i + k])
			{
				ossb->min_len[i + k] = ossb->min_len[i] + 1;
				ossb->parent[i + k] = i;
			}
		}
	}
	
	/* Recover the shortest split path by following parent links backwards;
	 * path[] ends up holding the segment end positions in forward order. */
	for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i])
		ossb->path[--j] = i;
	
	size_t inbuf_left_start = *inbuf_left;
	size_t begin, end;

	/* Convert segment by segment along the shortest split path */
	for (i = begin = 0; i < ossb->min_len[length]; i ++)
	{
		end = ossb->path[i];
		
		size_t match_len;
		const ucs4_t * match_rs = dictionary_group_match_longest(
				converter->current_dictionary_group,
				*inbuf,
				end - begin,
				&match_len
		);

		if (match_rs == NULL)
		{
			/* No dictionary entry: pass one character through. */
			**outbuf = **inbuf;
			(*outbuf) ++, (*outbuf_left) --;
			(*inbuf) ++, (*inbuf_left) --;
		}
		else
		{
			/* Remaining output space is smaller than the replacement */
			if (ucs4len(match_rs) > *outbuf_left)
			{
				/* Something was already converted: report partial progress. */
				if (inbuf_left_start - *inbuf_left > 0)
					break;
				errnum = CONVERTER_ERROR_OUTBUF;
				return (size_t) -1;
			}

			for (; *match_rs; match_rs ++)
			{
				**outbuf = *match_rs;
				(*outbuf) ++,(*outbuf_left) --;
			}

			*inbuf += match_len;
			*inbuf_left -= match_len;
		}
		
		begin = end;
	}
	
	return inbuf_left_start - *inbuf_left;
}