示例#1
0
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    grn_bool have_same_encoding_dictionary;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
    mecab_destroy(mecab);

    if (!have_same_encoding_dictionary) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "MeCab has no dictionary that uses the context encoding"
                       ": <%s>",
                       grn_encoding_to_string(encoding));
    }
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
  }
#endif
}
示例#2
0
void
grn_test_assert_equal_encoding_helper (grn_encoding expected,
                                       grn_encoding actual,
                                       const gchar *expression_expected,
                                       const gchar *expression_actual)
{
  if (expected == actual) {
    cut_test_pass();
  } else {
    cut_test_fail(cut_take_printf("<%s> == <%s>\n"
                                  "expected: <%s>\n"
                                  " but was: <%s>",
                                  expression_expected,
                                  expression_actual,
                                  grn_encoding_to_string(expected),
                                  grn_encoding_to_string(actual)));
  }
}
示例#3
0
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}