/*
 * Checks that MeCab's dictionary encoding matches the context encoding.
 *
 * A temporary MeCab instance is created with "-Owakati", its encoding is
 * compared with GRN_CTX_GET_ENCODING(ctx), and the instance is destroyed
 * again. A GRN_TOKENIZER_ERROR is reported on `ctx` when the instance
 * cannot be created or when the encodings differ.
 *
 * The whole body is compiled out unless HAVE_MECAB_DICTIONARY_INFO_T is
 * defined, in which case the function does nothing.
 */
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *tagger = mecab_new2("-Owakati");
  grn_encoding context_encoding;
  grn_bool encodings_match;

  if (!tagger) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
    return;
  }

  context_encoding = GRN_CTX_GET_ENCODING(ctx);
  encodings_match = (context_encoding == get_mecab_encoding(tagger));
  /* The instance is only needed for the comparison above. */
  mecab_destroy(tagger);

  if (encodings_match) {
    return;
  }

  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                   "[tokenizer][mecab] "
                   "MeCab has no dictionary that uses the context encoding"
                   ": <%s>",
                   grn_encoding_to_string(context_encoding));
#endif
}
/*
 * Checks that MeCab's dictionary encoding matches the context encoding.
 *
 * Creates a throwaway MeCab instance with "-Owakati", compares its
 * encoding with GRN_CTX_GET_ENCODING(ctx) and destroys the instance.
 * Reports GRN_TOKENIZER_ERROR through ERR() when the instance cannot be
 * created or when the encodings differ.
 *
 * Does nothing unless HAVE_MECAB_DICTIONARY_INFO_T is defined.
 */
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *tagger = mecab_new2("-Owakati");
  grn_encoding context_encoding;
  int encodings_match = 0;

  if (!tagger) {
    ERR(GRN_TOKENIZER_ERROR,
        "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
        mecab_strerror(NULL));
    return;
  }

  context_encoding = GRN_CTX_GET_ENCODING(ctx);
  encodings_match = context_encoding == get_mecab_encoding(tagger);
  /* The instance is only needed for the comparison above. */
  mecab_destroy(tagger);

  if (encodings_match) {
    return;
  }

  ERR(GRN_TOKENIZER_ERROR,
      "MeCab has no dictionary that uses the context encoding: <%s>",
      grn_enctostr(context_encoding));
#endif
}
/**
 * Opens a Groonga string for the given bytes using this field's normalizer.
 *
 * The context encoding is switched to the field's charset only for the
 * duration of grn_string_open() and is restored before returning.
 *
 * @param string         Start of the text to normalize.
 * @param string_length  Length of `string`, as passed to grn_string_open().
 * @return Whatever grn_string_open() returned.
 */
grn_obj *FieldNormalizer::normalize(const char *string,
                                    unsigned int string_length) {
  MRN_DBUG_ENTER_METHOD();

  grn_obj *field_normalizer = find_grn_normalizer();
  int open_flags = 0;

  // Remember the caller's encoding so it can be put back afterwards.
  grn_encoding saved_encoding = GRN_CTX_GET_ENCODING(ctx_);
  encoding::set(ctx_, field_->charset());

  grn_obj *normalized = grn_string_open(ctx_,
                                        string,
                                        string_length,
                                        field_normalizer,
                                        open_flags);

  GRN_CTX_SET_ENCODING(ctx_, saved_encoding);

  DBUG_RETURN(normalized);
}
/*
 * Registers the "tokenizers/mecab" plugin when the context encoding is
 * one of EUC-JP, UTF-8 or Shift_JIS.
 *
 * Returns the result of grn_plugin_register() for those encodings and
 * GRN_OPERATION_NOT_SUPPORTED for every other encoding.
 */
grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  grn_encoding context_encoding = GRN_CTX_GET_ENCODING(ctx);

  if (context_encoding != GRN_ENC_EUC_JP &&
      context_encoding != GRN_ENC_UTF8 &&
      context_encoding != GRN_ENC_SJIS) {
    return GRN_OPERATION_NOT_SUPPORTED;
  }

  return grn_plugin_register(ctx, "tokenizers/mecab");
}
/*
 * Token filter callback that stems the current token and stores the
 * result in the next token.
 *
 * Nothing happens unless the context encoding is UTF-8. Any stemmer
 * already held by the filter state is deleted and a new "english" /
 * "UTF_8" stemmer is created on each call. When stemming succeeds the
 * stemmed bytes become the data of `next_token`; failures are reported
 * on `ctx` and `next_token` is left untouched.
 *
 * ctx:           context used for error reporting and encoding lookup.
 * current_token: token whose data is stemmed.
 * next_token:    token that receives the stemmed data.
 * user_data:     the grn_stem_token_filter state for this filter.
 */
static void
stem_filter(grn_ctx *ctx,
            grn_token *current_token,
            grn_token *next_token,
            void *user_data)
{
  /* TODO: Detect algorithm from the current token. */
  const char *algorithm = "english";
  const char *encoding = "UTF_8";
  grn_stem_token_filter *filter = user_data;
  grn_obj *token_text;
  const sb_symbol *stem;

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  token_text = grn_token_get_data(ctx, current_token);

  /* Replace whatever stemmer was left over from the previous call. */
  if (filter->stemmer) {
    sb_stemmer_delete(filter->stemmer);
  }
  filter->stemmer = sb_stemmer_new(algorithm, encoding);
  if (!filter->stemmer) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[token-filter][stem] "
                     "failed to create stemmer: "
                     "algorithm=<%s>, encoding=<%s>",
                     algorithm, encoding);
    return;
  }

  stem = sb_stemmer_stem(filter->stemmer,
                         GRN_TEXT_VALUE(token_text),
                         GRN_TEXT_LEN(token_text));
  if (!stem) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stem] "
                     "failed to allocate memory for stemmed word: <%.*s>",
                     (int)GRN_TEXT_LEN(token_text),
                     GRN_TEXT_VALUE(token_text));
    return;
  }

  grn_token_set_data(ctx, next_token,
                     stem,
                     sb_stemmer_length(filter->stemmer));
}
/**
 * Opens a Groonga string for the given bytes using this field's normalizer.
 *
 * The normalizer specification is fetched as text via find_grn_normalizer().
 * Two forms are handled:
 *   - text ending in ')' (presumably a normalizer with options -- confirm
 *     against find_grn_normalizer()): the text is set as GRN_INFO_NORMALIZER
 *     on `lexicon_`, a patricia-trie table created on first use, and that
 *     table is passed to grn_string_open();
 *   - anything else: the text is looked up as an object name with
 *     grn_ctx_get() and the result is passed to grn_string_open().
 *
 * The context encoding is switched to the field's charset for the duration
 * of the call and restored before returning.
 *
 * @param string         Start of the text to normalize.
 * @param string_length  Length of `string`, as passed to grn_string_open().
 * @return Whatever grn_string_open() returned.
 */
grn_obj *FieldNormalizer::normalize(const char *string,
                                    unsigned int string_length) {
  MRN_DBUG_ENTER_METHOD();

  grn_obj normalizer;
  GRN_TEXT_INIT(&normalizer, 0);
  find_grn_normalizer(&normalizer);

  int flags = 0;
  grn_encoding original_encoding = GRN_CTX_GET_ENCODING(ctx_);
  encoding::set_raw(ctx_, field_->charset());

  // Check the length before looking at the last byte: with an empty
  // specification, indexing `length - 1` would read outside the buffer.
  // An empty specification takes the plain name-lookup path below.
  const bool normalizer_has_options =
    GRN_TEXT_LEN(&normalizer) > 0 &&
    GRN_TEXT_VALUE(&normalizer)[GRN_TEXT_LEN(&normalizer) - 1] == ')';

  grn_obj *grn_string;
  if (normalizer_has_options) {
    if (!lexicon_) {
      // Created once and kept in the member for later calls.
      lexicon_ = grn_table_create(ctx_,
                                  NULL, 0,
                                  NULL,
                                  GRN_OBJ_TABLE_PAT_KEY,
                                  grn_ctx_at(ctx_, GRN_DB_SHORT_TEXT),
                                  NULL);
    }
    grn_obj_set_info(ctx_, lexicon_, GRN_INFO_NORMALIZER, &normalizer);
    grn_string = grn_string_open(ctx_,
                                 string,
                                 string_length,
                                 lexicon_,
                                 flags);
  } else {
    grn_string = grn_string_open(ctx_,
                                 string,
                                 string_length,
                                 grn_ctx_get(ctx_,
                                             GRN_TEXT_VALUE(&normalizer),
                                             GRN_TEXT_LEN(&normalizer)),
                                 flags);
  }

  GRN_OBJ_FIN(ctx_, &normalizer);
  GRN_CTX_SET_ENCODING(ctx_, original_encoding);

  DBUG_RETURN(grn_string);
}
/*
 * Registers the "tokenizers/mecab" plugin when the context encoding is
 * one of EUC-JP, UTF-8 or Shift_JIS and the plugin can be located.
 *
 * Returns:
 *   - GRN_OPERATION_NOT_SUPPORTED for any other encoding;
 *   - GRN_NO_SUCH_FILE_OR_DIRECTORY when grn_plugin_find_path() finds
 *     no path for the plugin;
 *   - otherwise the result of grn_plugin_register().
 */
grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  const char *mecab_plugin_name = "tokenizers/mecab";
  grn_encoding context_encoding = GRN_CTX_GET_ENCODING(ctx);
  char *plugin_path;

  if (context_encoding != GRN_ENC_EUC_JP &&
      context_encoding != GRN_ENC_UTF8 &&
      context_encoding != GRN_ENC_SJIS) {
    return GRN_OPERATION_NOT_SUPPORTED;
  }

  plugin_path = grn_plugin_find_path(ctx, mecab_plugin_name);
  if (!plugin_path) {
    return GRN_NO_SUCH_FILE_OR_DIRECTORY;
  }

  /* The path is only needed as an existence check. */
  GRN_FREE(plugin_path);
  return grn_plugin_register(ctx, mecab_plugin_name);
}
/*
 * call-seq:
 *   context.encoding -> Groonga::Encoding
 *
 * Returns the encoding used by the context.
 */
static VALUE
rb_grn_context_get_encoding (VALUE self)
{
    grn_encoding encoding;

    encoding = GRN_CTX_GET_ENCODING(SELF(self));
    return GRNENCODING2RVAL(encoding);
}