Ejemplo n.º 1
0
/*
 * Checks that MeCab's dictionary encoding matches the encoding of `ctx`.
 *
 * A temporary MeCab instance is created with "-Owakati", its encoding (as
 * reported by get_mecab_encoding()) is compared against the context
 * encoding, and the instance is destroyed again.  A GRN_TOKENIZER_ERROR is
 * reported on `ctx` when the encodings differ or when mecab_new2() fails.
 *
 * The function does nothing when HAVE_MECAB_DICTIONARY_INFO_T is undefined.
 */
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *tagger = mecab_new2("-Owakati");
  grn_encoding ctx_encoding;
  grn_bool encoding_matches;

  /* Without a MeCab instance there is nothing to compare against. */
  if (!tagger) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
    return;
  }

  ctx_encoding = GRN_CTX_GET_ENCODING(ctx);
  encoding_matches = (get_mecab_encoding(tagger) == ctx_encoding);
  /* The instance was only needed for the encoding lookup. */
  mecab_destroy(tagger);

  if (encoding_matches) {
    return;
  }

  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                   "[tokenizer][mecab] "
                   "MeCab has no dictionary that uses the context encoding"
                   ": <%s>",
                   grn_encoding_to_string(ctx_encoding));
#endif
}
Ejemplo n.º 2
0
/*
 * Checks that MeCab's dictionary encoding matches the encoding of `ctx`.
 *
 * A temporary MeCab instance is created with "-Owakati", its encoding (as
 * reported by get_mecab_encoding()) is compared against the context
 * encoding, and the instance is destroyed again.  GRN_TOKENIZER_ERROR is
 * raised through ERR() when the encodings differ or when mecab_new2() fails.
 *
 * The function does nothing when HAVE_MECAB_DICTIONARY_INFO_T is undefined.
 */
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *tagger = mecab_new2("-Owakati");
  grn_encoding ctx_encoding;
  int encoding_matches;

  /* Without a MeCab instance there is nothing to compare against. */
  if (!tagger) {
    ERR(GRN_TOKENIZER_ERROR,
        "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
        mecab_strerror(NULL));
    return;
  }

  ctx_encoding = GRN_CTX_GET_ENCODING(ctx);
  encoding_matches = (get_mecab_encoding(tagger) == ctx_encoding);
  /* The instance was only needed for the encoding lookup. */
  mecab_destroy(tagger);

  if (encoding_matches) {
    return;
  }

  ERR(GRN_TOKENIZER_ERROR,
      "MeCab has no dictionary that uses the context encoding: <%s>",
      grn_enctostr(ctx_encoding));
#endif
}
Ejemplo n.º 3
0
 /**
  * Opens a Groonga string for `string` using the normalizer returned by
  * find_grn_normalizer().
  *
  * The context encoding is switched to the field's charset for the duration
  * of grn_string_open() and restored to its previous value afterwards.
  *
  * @param string         Bytes to normalize.
  * @param string_length  Number of bytes in `string`.
  * @return The object returned by grn_string_open().
  */
 grn_obj *FieldNormalizer::normalize(const char *string,
                                     unsigned int string_length) {
   MRN_DBUG_ENTER_METHOD();
   grn_obj *grn_normalizer = find_grn_normalizer();
   const int no_flags = 0;

   // Remember the current encoding so it can be put back after the call.
   const grn_encoding saved_encoding = GRN_CTX_GET_ENCODING(ctx_);
   encoding::set(ctx_, field_->charset());

   grn_obj *normalized =
     grn_string_open(ctx_, string, string_length, grn_normalizer, no_flags);

   GRN_CTX_SET_ENCODING(ctx_, saved_encoding);
   DBUG_RETURN(normalized);
 }
Ejemplo n.º 4
0
Archivo: token.c Proyecto: mooz/groonga
/*
 * Registers the "tokenizers/mecab" plugin when the context encoding is
 * EUC-JP, UTF-8 or Shift_JIS.
 *
 * Returns the result of grn_plugin_register() for those encodings and
 * GRN_OPERATION_NOT_SUPPORTED for every other encoding.
 */
grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  grn_rc rc = GRN_OPERATION_NOT_SUPPORTED;

  switch (GRN_CTX_GET_ENCODING(ctx)) {
  case GRN_ENC_EUC_JP :
  case GRN_ENC_UTF8 :
  case GRN_ENC_SJIS :
    rc = grn_plugin_register(ctx, "tokenizers/mecab");
    break;
  default :
    /* Any other encoding keeps the "not supported" result. */
    break;
  }

  return rc;
}
Ejemplo n.º 5
0
/*
 * Token filter callback: stems the text of `current_token` and stores the
 * result in `next_token`.
 *
 * ctx            Groonga context; nothing is done unless its encoding is
 *                GRN_ENC_UTF8.
 * current_token  Token whose data is passed to the stemmer.
 * next_token     Token that receives the stemmed data.
 * user_data      The filter's grn_stem_token_filter state.
 *
 * Errors are reported on `ctx` with GRN_PLUGIN_ERROR when the stemmer cannot
 * be created (GRN_INVALID_ARGUMENT) or when stemming returns NULL
 * (GRN_NO_MEMORY_AVAILABLE).
 */
static void
stem_filter(grn_ctx *ctx,
            grn_token *current_token,
            grn_token *next_token,
            void *user_data)
{
  grn_stem_token_filter *token_filter = user_data;
  grn_obj *data;
  const sb_symbol *stemmed;

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  data = grn_token_get_data(ctx, current_token);

  /*
   * The algorithm and encoding are fixed, so the stemmer is created once on
   * first use and reused for later tokens instead of being deleted and
   * re-created on every call.
   * NOTE(review): assumes token_filter->stemmer is either NULL or a stemmer
   * created here with the same settings -- confirm against the filter's
   * init code.
   */
  if (!token_filter->stemmer) {
    /* TODO: Detect algorithm from the current token. */
    const char *algorithm = "english";
    const char *encoding = "UTF_8";
    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
    if (!token_filter->stemmer) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[token-filter][stem] "
                       "failed to create stemmer: "
                       "algorithm=<%s>, encoding=<%s>",
                       algorithm, encoding);
      return;
    }
  }

  stemmed = sb_stemmer_stem(token_filter->stemmer,
                            GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
  if (!stemmed) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stem] "
                     "failed to allocate memory for stemmed word: <%.*s>",
                     (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
    return;
  }

  /* The length of the stemmed word is queried from the stemmer itself. */
  grn_token_set_data(ctx, next_token,
                     stemmed,
                     sb_stemmer_length(token_filter->stemmer));
}
Ejemplo n.º 6
0
 /**
  * Opens a Groonga string for `string` using the normalizer described by the
  * text that find_grn_normalizer() writes into a local buffer.
  *
  * The context encoding is switched to the field's charset while the string
  * is opened and restored to its previous value before returning.
  *
  * @param string         Bytes to normalize.
  * @param string_length  Number of bytes in `string`.
  * @return The object returned by grn_string_open().
  */
 grn_obj *FieldNormalizer::normalize(const char *string,
                                     unsigned int string_length) {
   MRN_DBUG_ENTER_METHOD();
   grn_obj normalizer;
   GRN_TEXT_INIT(&normalizer, 0);
   find_grn_normalizer(&normalizer);
   int flags = 0;
   grn_encoding original_encoding = GRN_CTX_GET_ENCODING(ctx_);
   encoding::set_raw(ctx_, field_->charset());
   grn_obj *grn_string;
   // Guard against an empty normalizer text: indexing [length - 1] on an
   // empty buffer would read outside of it.
   if (GRN_TEXT_LEN(&normalizer) > 0 &&
       GRN_TEXT_VALUE(&normalizer)[GRN_TEXT_LEN(&normalizer) - 1] == ')') {
     // A text ending in ')' is applied through a lexicon table whose
     // GRN_INFO_NORMALIZER is set to that text.
     // NOTE(review): presumably this is the "Name(options...)" form -- confirm.
     if (!lexicon_) {
       // The lexicon is created on first use and kept in lexicon_.
       lexicon_ = grn_table_create(ctx_,
                                   NULL, 0,
                                   NULL,
                                   GRN_OBJ_TABLE_PAT_KEY,
                                   grn_ctx_at(ctx_, GRN_DB_SHORT_TEXT),
                                   NULL);
     }
     grn_obj_set_info(ctx_, lexicon_, GRN_INFO_NORMALIZER, &normalizer);
     grn_string = grn_string_open(ctx_,
                                  string,
                                  string_length,
                                  lexicon_,
                                  flags);
   } else {
     // Otherwise the text is looked up as an object name with grn_ctx_get().
     grn_string = grn_string_open(ctx_,
                                  string,
                                  string_length,
                                  grn_ctx_get(ctx_,
                                              GRN_TEXT_VALUE(&normalizer),
                                              GRN_TEXT_LEN(&normalizer)),
                                  flags);
   }
   GRN_OBJ_FIN(ctx_, &normalizer);
   GRN_CTX_SET_ENCODING(ctx_, original_encoding);
   DBUG_RETURN(grn_string);
 }
Ejemplo n.º 7
0
/*
 * Registers the "tokenizers/mecab" plugin when the context encoding is
 * EUC-JP, UTF-8 or Shift_JIS and grn_plugin_find_path() finds the plugin.
 *
 * Returns:
 *   - the result of grn_plugin_register() when the plugin path was found,
 *   - GRN_NO_SUCH_FILE_OR_DIRECTORY when no path was found,
 *   - GRN_OPERATION_NOT_SUPPORTED for every other encoding.
 */
grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  grn_rc rc = GRN_OPERATION_NOT_SUPPORTED;
  const char *plugin_name = "tokenizers/mecab";
  char *plugin_path;

  switch (GRN_CTX_GET_ENCODING(ctx)) {
  case GRN_ENC_EUC_JP :
  case GRN_ENC_UTF8 :
  case GRN_ENC_SJIS :
    plugin_path = grn_plugin_find_path(ctx, plugin_name);
    if (plugin_path) {
      /* The path is only used as an existence check; release it at once. */
      GRN_FREE(plugin_path);
      rc = grn_plugin_register(ctx, plugin_name);
    } else {
      rc = GRN_NO_SUCH_FILE_OR_DIRECTORY;
    }
    break;
  default :
    /* Any other encoding keeps the "not supported" result. */
    break;
  }

  return rc;
}
Ejemplo n.º 8
0
/*
 * call-seq:
 *   context.encoding -> Groonga::Encoding
 *
 * Returns the encoding used by the context.
 */
static VALUE
rb_grn_context_get_encoding (VALUE self)
{
    VALUE rb_encoding;

    /* Read the context's encoding and convert it to its Ruby value. */
    rb_encoding = GRNENCODING2RVAL(GRN_CTX_GET_ENCODING(SELF(self)));

    return rb_encoding;
}