static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    grn_bool have_same_encoding_dictionary;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
    mecab_destroy(mecab);
    if (!have_same_encoding_dictionary) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "MeCab has no dictionary that uses the context encoding"
                       ": <%s>",
                       grn_encoding_to_string(encoding));
    }
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2() failed in check_mecab_dictionary_encoding(): %s",
                     mecab_global_error_message());
  }
#endif
}
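/*
  A minimal sketch of how the check above is typically invoked: from the
  plugin's GRN_PLUGIN_INIT() entry point, after the shared mutex is created
  and before any tokenizer is registered, so that a dictionary/context
  encoding mismatch fails plugin initialization early. This is an
  illustration of the call site, not the exact upstream body; other setup
  (e.g. grn_mecab_chunked_tokenize_enabled) is omitted.
*/
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  sole_mecab = NULL;
  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (!sole_mecab_mutex) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
    return ctx->rc;
  }

  /* Sets ctx->rc to GRN_TOKENIZER_ERROR on an encoding mismatch. */
  check_mecab_dictionary_encoding(ctx);
  return ctx->rc;
}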
void
grn_test_assert_equal_encoding_helper (grn_encoding expected,
                                       grn_encoding actual,
                                       const gchar *expression_expected,
                                       const gchar *expression_actual)
{
  if (expected == actual) {
    cut_test_pass();
  } else {
    cut_test_fail(cut_take_printf("<%s> == <%s>\n"
                                  "expected: <%s>\n"
                                  " but was: <%s>",
                                  expression_expected,
                                  expression_actual,
                                  grn_encoding_to_string(expected),
                                  grn_encoding_to_string(actual)));
  }
}
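/*
  A plausible wrapper for the helper above, following the usual Cutter
  convention for custom assertions: the macro stringifies its two arguments
  so the failure message can show the original expressions, and
  cut_trace_with_info_expression() keeps the caller's location in the
  backtrace. The exact macro shipped with the test suite may differ; this is
  a sketch of the pattern.
*/
#define grn_test_assert_equal_encoding(expected, actual)                \
  cut_trace_with_info_expression(                                       \
    grn_test_assert_equal_encoding_helper((expected), (actual),         \
                                          #expected, #actual),          \
    grn_test_assert_equal_encoding(expected, actual))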
/*
  This function is called for a full text search query or a document to be
  indexed, so both short and long strings are given. The return value of
  this function is ignored. When an error occurs in this function,
  `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
*/
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed in mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;

    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;

      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr2() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of MeCab returns a trailing LF or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) {
        *p = '\0';
      }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
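/*
  A sketch of the matching fin callback, assuming the same grn_proc
  signature as mecab_init() above: it releases, in reverse order, what a
  successful mecab_init() acquired (the token, the query, the work buffer,
  and the tokenizer struct itself). sole_mecab is deliberately not destroyed
  here, since it is shared across tokenizations and lives until plugin
  finalization.
*/
static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer = user_data->ptr;

  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_OBJ_FIN(ctx, &(tokenizer->buf));
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}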