Code example #1
File: mecab.c Project: asmlib/mariadb-server
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    grn_bool have_same_encoding_dictionary;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
    mecab_destroy(mecab);

    if (!have_same_encoding_dictionary) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "MeCab has no dictionary that uses the context encoding"
                       ": <%s>",
                       grn_encoding_to_string(encoding));
    }
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
  }
#endif
}
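Example #1 relies on get_mecab_encoding(), which is not shown in this listing. A minimal sketch of what such a helper could look like, assuming MeCab's mecab_dictionary_info() API and an assumed charset-mapping helper (translate_mecab_charset_to_grn_encoding is a hypothetical name here):

static grn_encoding
get_mecab_encoding(mecab_t *mecab)
{
  grn_encoding encoding = GRN_ENC_NONE;
  const mecab_dictionary_info_t *dictionary_info;

  dictionary_info = mecab_dictionary_info(mecab);
  if (dictionary_info) {
    /* charset is the dictionary's declared encoding name,
       e.g. "UTF-8" or "EUC-JP"; mapping it to grn_encoding is
       delegated to an assumed helper. */
    encoding = translate_mecab_charset_to_grn_encoding(dictionary_info->charset);
  }
  return encoding;
}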
Code example #2
File: tokenizer.c Project: groonga/groonga
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags;
  grn_obj *query_str;
  grn_obj *tokenize_mode;

  GRN_API_ENTER;

  flags = grn_ctx_pop(ctx);
  query_str = grn_ctx_pop(ctx);
  tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    GRN_API_RETURN(NULL);
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    GRN_API_RETURN(NULL);
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (!query) {
      GRN_API_RETURN(NULL);
    }
    grn_tokenizer_query_init(ctx, query);
    grn_tokenizer_query_set_raw_string(ctx,
                                       query,
                                       GRN_TEXT_VALUE(query_str),
                                       GRN_TEXT_LEN(query_str));
    if (ctx->rc != GRN_SUCCESS) {
      GRN_PLUGIN_FREE(ctx, query);
      GRN_API_RETURN(NULL);
    }
    if (flags) {
      grn_tokenizer_query_set_flags(ctx, query, GRN_UINT32_VALUE(flags));
    }
    if (tokenize_mode) {
      grn_tokenizer_query_set_mode(ctx, query, GRN_UINT32_VALUE(tokenize_mode));
    }
    grn_tokenizer_query_set_normalize_flags(ctx, query, normalize_flags);
    grn_tokenizer_query_set_lexicon(ctx, query, args[0]);

    grn_tokenizer_query_ensure_have_tokenized_delimiter(ctx, query);

    GRN_API_RETURN(query);
  }
}
Code example #3
File: stem.c Project: tamano/groonga
static void
stem_filter(grn_ctx *ctx,
            grn_token *current_token,
            grn_token *next_token,
            void *user_data)
{
  grn_stem_token_filter *token_filter = user_data;
  grn_obj *data;

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  data = grn_token_get_data(ctx, current_token);

  if (token_filter->stemmer) {
    sb_stemmer_delete(token_filter->stemmer);
  }
  {
    /* TODO: Detect algorithm from the current token. */
    const char *algorithm = "english";
    const char *encoding = "UTF_8";
    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
    if (!token_filter->stemmer) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[token-filter][stem] "
                       "failed to create stemmer: "
                       "algorithm=<%s>, encoding=<%s>",
                       algorithm, encoding);
      return;
    }
  }

  {
    const sb_symbol *stemmed;

    stemmed = sb_stemmer_stem(token_filter->stemmer,
                              (const sb_symbol *)GRN_TEXT_VALUE(data),
                              GRN_TEXT_LEN(data));
    if (stemmed) {
      grn_token_set_data(ctx, next_token,
                         (const char *)stemmed,
                         sb_stemmer_length(token_filter->stemmer));
    } else {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[token-filter][stem] "
                       "failed to allocate memory for stemmed word: <%.*s>",
                       (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
      return;
    }
  }
}
Code example #4
File: mecab.c Project: asmlib/mariadb-server
static grn_bool
chunked_tokenize_utf8_chunk(grn_ctx *ctx,
                            grn_mecab_tokenizer *tokenizer,
                            const char *chunk,
                            unsigned int chunk_bytes)
{
  const char *tokenized_chunk;
  size_t tokenized_chunk_length;

  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
  if (!tokenized_chunk) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab][chunk] "
                     "mecab_sparse_tostr2() failed len=%d err=%s",
                     chunk_bytes,
                     mecab_strerror(tokenizer->mecab));
    return GRN_FALSE;
  }

  if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
    GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
  }

  tokenized_chunk_length = strlen(tokenized_chunk);
  if (tokenized_chunk_length >= 1 &&
      isspace(tokenized_chunk[tokenized_chunk_length - 1])) {
    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
                 tokenized_chunk, tokenized_chunk_length - 1);
  } else {
    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
                 tokenized_chunk, tokenized_chunk_length);
  }

  return GRN_TRUE;
}
Code example #5
File: token_filter.c Project: AkioKanno/groonga
grn_rc
grn_token_filter_register(grn_ctx *ctx,
                          const char *plugin_name_ptr,
                          int plugin_name_length,
                          grn_token_filter_init_func *init,
                          grn_token_filter_filter_func *filter,
                          grn_token_filter_fin_func *fin)
{
  if (plugin_name_length == -1) {
    plugin_name_length = strlen(plugin_name_ptr);
  }

  {
    grn_obj *token_filter_object = grn_proc_create(ctx,
                                                   plugin_name_ptr,
                                                   plugin_name_length,
                                                   GRN_PROC_TOKENIZER,
                                                   NULL, NULL, NULL, 0, NULL);
    if (token_filter_object == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR,
                       "[token-filter][%.*s] failed to grn_proc_create()",
                       plugin_name_length, plugin_name_ptr);
      return ctx->rc;
    }

    {
      grn_proc *token_filter = (grn_proc *)token_filter_object;
      token_filter->callbacks.token_filter.init = init;
      token_filter->callbacks.token_filter.filter = filter;
      token_filter->callbacks.token_filter.fin = fin;
    }
  }

  return GRN_SUCCESS;
}
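A plugin typically calls grn_token_filter_register() from its GRN_PLUGIN_REGISTER entry point. A minimal sketch, wiring up the stem token filter callbacks from code examples #3 and #15 (stem_fin is not part of this listing; a sketch of it follows example #15 below):

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  /* -1 lets grn_token_filter_register() compute the name length itself */
  return grn_token_filter_register(ctx,
                                   "TokenFilterStem", -1,
                                   stem_init,
                                   stem_filter,
                                   stem_fin);
}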
Code example #6
File: tokenizer.c Project: AkioKanno/groonga
grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[] = {
    { NULL, 0 },
    { NULL, 0 },
    { NULL, 0 }
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /*
      grn_proc_create() registers a plugin to the database which is associated
      with `ctx'. A returned object must not be finalized here.
     */
    grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
                                          plugin_name_length,
                                          GRN_PROC_TOKENIZER,
                                          init, next, fin, 3, vars);
    if (obj == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
      return ctx->rc;
    }
  }
  return GRN_SUCCESS;
}
Code example #7
File: tokenizer.c Project: groonga/groonga
static void
grn_tokenizer_query_ensure_normalized(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (!query->need_normalize) {
    return;
  }

  query->need_normalize = GRN_FALSE;

  if (query->normalized_query) {
    grn_obj_close(ctx, query->normalized_query);
  }
  query->normalized_query = grn_string_open_(ctx,
                                             query->ptr,
                                             query->length,
                                             query->lexicon,
                                             query->normalize_flags,
                                             query->encoding);
  if (!query->normalized_query) {
    query->have_tokenized_delimiter = GRN_FALSE;
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][normalize] failed to open normalized string");
    return;
  }

  query->need_delimiter_check = GRN_TRUE;
}
Code example #8
File: tokenizer.c Project: groonga/groonga
grn_obj *
grn_tokenizer_create(grn_ctx *ctx,
                     const char *name,
                     int name_length)
{
  grn_obj *tokenizer;

  GRN_API_ENTER;
  tokenizer = grn_proc_create(ctx,
                              name,
                              name_length,
                              GRN_PROC_TOKENIZER,
                              NULL,
                              NULL,
                              NULL,
                              0,
                              NULL);
  if (!tokenizer) {
    if (name_length < 0) {
      name_length = strlen(name);
    }
    GRN_PLUGIN_ERROR(ctx,
                     GRN_TOKENIZER_ERROR,
                     "[tokenizer][create] failed to create: <%.*s>",
                     name_length, name);
  }

  GRN_API_RETURN(tokenizer);
}
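grn_tokenizer_create() only creates the proc; callbacks are attached afterwards with setters such as grn_tokenizer_set_fin_func() (code example #13). A minimal sketch of the combined flow, assuming hypothetical my_init/my_next/my_fin callbacks and assuming grn_tokenizer_set_init_func()/grn_tokenizer_set_next_func() exist alongside the fin setter:

static grn_rc
register_my_tokenizer(grn_ctx *ctx)
{
  grn_obj *tokenizer = grn_tokenizer_create(ctx, "TokenMine", -1);
  if (!tokenizer) {
    return ctx->rc;
  }
  /* my_init/my_next/my_fin are assumed callbacks with the
     grn_tokenizer_*_func signatures; they are not in this listing. */
  grn_tokenizer_set_init_func(ctx, tokenizer, my_init);
  grn_tokenizer_set_next_func(ctx, tokenizer, my_next);
  grn_tokenizer_set_fin_func(ctx, tokenizer, my_fin);
  return ctx->rc;
}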
Code example #9
File: tokenizer.c Project: groonga/groonga
grn_rc
grn_tokenizer_query_set_raw_string(grn_ctx *ctx,
                                   grn_tokenizer_query *query,
                                   const char *string,
                                   size_t string_length)
{
  GRN_API_ENTER;

  if (query->query_buf) {
    GRN_PLUGIN_FREE(ctx, query->query_buf);
  }

  if (string_length == 0) {
    query->query_buf = NULL;
    query->ptr = NULL;
    query->length = 0;
    query->need_normalize = GRN_TRUE;
  } else {
    query->query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, string_length + 1);
    if (!query->query_buf) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][query] failed to duplicate query");
      GRN_API_RETURN(ctx->rc);
    }
    grn_memcpy(query->query_buf, string, string_length);
    query->query_buf[string_length] = '\0';
    query->ptr = query->query_buf;
    query->length = string_length;
  }

  GRN_API_RETURN(ctx->rc);
}
Code example #10
File: vector.c Project: digideskio/groonga
static grn_obj *
func_vector_size(grn_ctx *ctx, int n_args, grn_obj **args,
                 grn_user_data *user_data)
{
  grn_obj *target;
  unsigned int size;
  grn_obj *grn_size;

  if (n_args != 1) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "vector_size(): wrong number of arguments (%d for 1)",
                     n_args);
    return NULL;
  }

  target = args[0];
  switch (target->header.type) {
  case GRN_VECTOR :
  case GRN_PVECTOR :
  case GRN_UVECTOR :
    size = grn_vector_size(ctx, target);
    break;
  default :
    {
      grn_obj inspected;

      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, target, &inspected);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "vector_size(): target object must be vector: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      return NULL;
    }
    break;
  }

  grn_size = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0);
  if (!grn_size) {
    return NULL;
  }

  GRN_UINT32_SET(ctx, grn_size, size);

  return grn_size;
}
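func_vector_size() becomes callable from queries once it is registered as a GRN_PROC_FUNCTION proc. A minimal sketch of the registration, following the grn_proc_create() pattern used elsewhere in this listing (for function procs the function pointer goes in the init slot):

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  grn_proc_create(ctx, "vector_size", -1, GRN_PROC_FUNCTION,
                  func_vector_size, NULL, NULL, 0, NULL);
  return ctx->rc;
}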
Code example #11
File: proc_highlight.c Project: groonga/groonga
static grn_obj *
func_highlight_html(grn_ctx *ctx, int nargs, grn_obj **args,
                    grn_user_data *user_data)
{
  grn_obj *highlighted = NULL;
  grn_obj *string;
  grn_obj *lexicon = NULL;
  grn_obj *expression = NULL;
  grn_highlighter *highlighter;
  grn_obj *highlighter_ptr;

  if (!(1 <= nargs && nargs <= 2)) {
    GRN_PLUGIN_ERROR(ctx,
                     GRN_INVALID_ARGUMENT,
                     "highlight_html(): wrong number of arguments (%d for 1..2)",
                     nargs);
    highlighted = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_VOID, 0);
    return highlighted;
  }

  string = args[0];
  if (nargs == 2) {
    lexicon = args[1];
  }

  grn_proc_get_info(ctx, user_data, NULL, NULL, &expression);

  highlighter_ptr = grn_expr_get_var(ctx, expression,
                                     GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME,
                                     strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME));
  if (highlighter_ptr) {
    highlighter = (grn_highlighter *)GRN_PTR_VALUE(highlighter_ptr);
  } else {
    highlighter_ptr =
      grn_expr_get_or_add_var(ctx, expression,
                              GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME,
                              strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME));
    GRN_OBJ_FIN(ctx, highlighter_ptr);
    GRN_PTR_INIT(highlighter_ptr, GRN_OBJ_OWN, GRN_DB_OBJECT);

    highlighter = func_highlight_html_create_highlighter(ctx, expression);
    grn_highlighter_set_lexicon(ctx, highlighter, lexicon);
    GRN_PTR_SET(ctx, highlighter_ptr, highlighter);
  }

  highlighted = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_TEXT, 0);
  grn_highlighter_highlight(ctx,
                            highlighter,
                            GRN_TEXT_VALUE(string),
                            GRN_TEXT_LEN(string),
                            highlighted);

  return highlighted;
}
Code example #12
/*
  This function initializes a plugin. This function fails if there is no
  dictionary that uses the context encoding of groonga.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  sole_mecab = NULL;
  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (!sole_mecab_mutex) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
    return ctx->rc;
  }

  check_mecab_dictionary_encoding(ctx);
  return ctx->rc;
}
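The resources acquired here are released in the matching GRN_PLUGIN_FIN entry point. A minimal sketch (not the project's actual finalizer), assuming the sole_mecab/sole_mecab_mutex globals used in this example:

grn_rc
GRN_PLUGIN_FIN(grn_ctx *ctx)
{
  if (sole_mecab) {
    mecab_destroy(sole_mecab);
    sole_mecab = NULL;
  }
  if (sole_mecab_mutex) {
    grn_plugin_mutex_close(ctx, sole_mecab_mutex);
    sole_mecab_mutex = NULL;
  }
  return GRN_SUCCESS;
}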
Code example #13
File: tokenizer.c Project: groonga/groonga
grn_rc
grn_tokenizer_set_fin_func(grn_ctx *ctx,
                           grn_obj *tokenizer,
                           grn_tokenizer_fin_func *fin)
{
  GRN_API_ENTER;
  if (tokenizer) {
    ((grn_proc *)tokenizer)->callbacks.tokenizer.fin = fin;
  } else {
    GRN_PLUGIN_ERROR(ctx,
                     GRN_INVALID_ARGUMENT,
                     "[tokenizer][fin][set] tokenizer is NULL");
  }
  GRN_API_RETURN(ctx->rc);
}
Code example #14
File: proc_schema.c Project: XLPE/groonga
static void
command_schema_column_output_indexes(grn_ctx *ctx, grn_obj *column)
{
  uint32_t i;
  grn_index_datum *index_data = NULL;
  uint32_t n_index_data = 0;

  if (column) {
    n_index_data = grn_column_get_all_index_data(ctx, column, NULL, 0);
    if (n_index_data > 0) {
      index_data = GRN_PLUGIN_MALLOC(ctx,
                                     sizeof(grn_index_datum) * n_index_data);
      if (!index_data) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[schema] failed to allocate memory for indexes");
        return;
      }
      grn_column_get_all_index_data(ctx, column, index_data, n_index_data);
    }
  }

  grn_ctx_output_array_open(ctx, "indexes", n_index_data);
  for (i = 0; i < n_index_data; i++) {
    grn_obj *lexicon;

    grn_ctx_output_map_open(ctx, "index", 4);

    grn_ctx_output_cstr(ctx, "full_name");
    command_schema_output_name(ctx, index_data[i].index);

    grn_ctx_output_cstr(ctx, "table");
    lexicon = grn_ctx_at(ctx, index_data[i].index->header.domain);
    command_schema_output_name(ctx, lexicon);

    grn_ctx_output_cstr(ctx, "name");
    command_schema_output_column_name(ctx, index_data[i].index);

    grn_ctx_output_cstr(ctx, "section");
    grn_ctx_output_uint64(ctx, index_data[i].section);

    grn_ctx_output_map_close(ctx);
  }
  grn_ctx_output_array_close(ctx);

  if (index_data) {
    GRN_PLUGIN_FREE(ctx, index_data);
  }
}
Code example #15
File: stem.c Project: tamano/groonga
static void *
stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
{
  grn_stem_token_filter *token_filter;

  token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));
  if (!token_filter) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stem] "
                     "failed to allocate grn_stem_token_filter");
    return NULL;
  }

  token_filter->stemmer = NULL;
  grn_tokenizer_token_init(ctx, &(token_filter->token));

  return token_filter;
}
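For completeness, a minimal sketch of the matching stem_fin finalizer (not shown in this listing), which mirrors stem_init by releasing the stemmer and the token:

static void
stem_fin(grn_ctx *ctx, void *user_data)
{
  grn_stem_token_filter *token_filter = user_data;

  if (!token_filter) {
    return;
  }
  grn_tokenizer_token_fin(ctx, &(token_filter->token));
  if (token_filter->stemmer) {
    sb_stemmer_delete(token_filter->stemmer);
  }
  GRN_PLUGIN_FREE(ctx, token_filter);
}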
Code example #16
File: mecab.c Project: asmlib/mariadb-server
/*
  This function initializes a plugin. This function fails if there is no
  dictionary that uses the context encoding of groonga.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  {
    char env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED",
               env,
               GRN_ENV_BUFFER_SIZE);
    grn_mecab_chunked_tokenize_enabled = (env[0] && strcmp(env, "yes") == 0);
  }

  {
    char env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD",
               env,
               GRN_ENV_BUFFER_SIZE);
    if (env[0]) {
      int threshold = -1;
      const char *end;
      const char *rest;

      end = env + strlen(env);
      threshold = grn_atoi(env, end, &rest);
      if (end > env && end == rest) {
        grn_mecab_chunk_size_threshold = threshold;
      }
    }
  }

  sole_mecab = NULL;
  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (!sole_mecab_mutex) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
    return ctx->rc;
  }

  check_mecab_dictionary_encoding(ctx);
  return ctx->rc;
}
Code example #17
static grn_obj *
sample_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  sample_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(sample_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][sample] "
                     "memory allocation to sample_tokenizer failed");
    return NULL;
  }

  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->next = normalized_string;
  tokenizer->rest = normalized_string_length;

  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
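sample_init() stores the tokenizer in user_data->ptr, so the matching finalizer gets it back from there. A minimal sketch of such a sample_fin (assumed, not part of this listing), in the old-style grn_proc_func shape that grn_tokenizer_register() expects:

static grn_obj *
sample_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;

  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}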
Code example #18
File: proc_tokenize.c Project: XLPE/groonga
static unsigned int
parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
{
  unsigned int flags = 0;
  const char *names, *names_end;
  int length;

  names = GRN_TEXT_VALUE(flag_names);
  length = GRN_TEXT_LEN(flag_names);
  names_end = names + length;
  while (names < names_end) {
    if (*names == '|' || *names == ' ') {
      names += 1;
      continue;
    }

#define CHECK_FLAG(name)\
    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
        (!memcmp(names, #name, sizeof(#name) - 1))) {\
      flags |= GRN_TOKEN_CURSOR_ ## name;\
      names += sizeof(#name) - 1;\
      continue;\
    }

    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);

#define GRN_TOKEN_CURSOR_NONE 0
    CHECK_FLAG(NONE);
#undef GRN_TOKEN_CURSOR_NONE

    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] invalid flag: <%.*s>",
                     (int)(names_end - names), names);
    return 0;
#undef CHECK_FLAG
  }

  return flags;
}
Code example #19
File: command.c Project: AkioKanno/groonga
grn_rc
grn_command_register(grn_ctx *ctx,
                     const char *command_name,
                     int command_name_size,
                     grn_command_run_func *run,
                     grn_expr_var *vars,
                     unsigned int n_vars,
                     void *user_data)
{
  GRN_API_ENTER;

  if (command_name_size == -1) {
    command_name_size = strlen(command_name);
  }

  {
    grn_obj *command_object;
    command_object = grn_proc_create(ctx,
                                     command_name,
                                     command_name_size,
                                     GRN_PROC_COMMAND,
                                     NULL, NULL, NULL, n_vars, vars);
    if (!command_object) {
      GRN_PLUGIN_ERROR(ctx, GRN_COMMAND_ERROR,
                       "[command][%.*s] failed to grn_proc_create()",
                       command_name_size, command_name);
      GRN_API_RETURN(ctx->rc);
    }

    {
      grn_proc *command = (grn_proc *)command_object;
      command->callbacks.command.run = run;
      command->user_data = user_data;
    }
  }

  GRN_API_RETURN(GRN_SUCCESS);
}
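A minimal usage sketch for grn_command_register(), assuming a hypothetical hello_run callback and assuming grn_command_run_func has the (ctx, command, input, user_data) shape:

static void
hello_run(grn_ctx *ctx, grn_obj *command,
          grn_command_input *input, void *user_data)
{
  /* emit a constant response body */
  grn_ctx_output_cstr(ctx, "hello");
}

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  /* -1: let grn_command_register() compute the name length */
  return grn_command_register(ctx, "hello", -1, hello_run, NULL, 0, NULL);
}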
Code example #20
File: proc_highlight.c Project: ohkubo/groonga
static grn_obj *
func_highlight_create_keywords_table(grn_ctx *ctx,
                                     grn_user_data *user_data,
                                     const char *normalizer_name,
                                     unsigned int normalizer_name_length)
{
  grn_obj *keywords;

  keywords = grn_table_create(ctx, NULL, 0, NULL,
                              GRN_OBJ_TABLE_PAT_KEY,
                              grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
                              NULL);

  if (normalizer_name_length > 0) {
    grn_obj *normalizer;
    normalizer = grn_ctx_get(ctx,
                             normalizer_name,
                             normalizer_name_length);
    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
      grn_obj inspected;
      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, normalizer);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "highlight_full() not normalizer: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      grn_obj_unlink(ctx, normalizer);
      grn_obj_unlink(ctx, keywords);
      return NULL;
    }
    grn_obj_set_info(ctx, keywords, GRN_INFO_NORMALIZER, normalizer);
    grn_obj_unlink(ctx, normalizer);
  }

  return keywords;
}
Code example #21
File: mecab.c Project: asmlib/mariadb-server
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;

    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    current_character = current;
    current += character_bytes;
    if (is_delimiter_character(ctx, current_character, character_bytes)) {
      last_delimiter = current;
    }

    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}
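chunked_tokenize_utf8() prefers to cut a chunk at the last delimiter it saw. is_delimiter_character() is not shown in this listing; a simplified sketch of what it could check, covering ASCII sentence punctuation plus the 3-byte UTF-8 sequences for "、" (U+3001) and "。" (U+3002) — the real implementation may cover more characters:

static grn_bool
is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
{
  switch (character_bytes) {
  case 1 :
    switch (character[0]) {
    case ',' :
    case '.' :
    case '!' :
    case '?' :
      return GRN_TRUE;
    default :
      return GRN_FALSE;
    }
  case 3 :
    /* U+3001 is E3 80 81 and U+3002 is E3 80 82 in UTF-8 */
    if ((unsigned char)(character[0]) == 0xE3 &&
        (unsigned char)(character[1]) == 0x80 &&
        ((unsigned char)(character[2]) == 0x81 ||
         (unsigned char)(character[2]) == 0x82)) {
      return GRN_TRUE;
    }
    return GRN_FALSE;
  default :
    return GRN_FALSE;
  }
}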
Code example #22
File: proc_fuzzy_search.c Project: XLPE/groonga
static grn_rc
selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index,
                      int nargs, grn_obj **args,
                      grn_obj *res, grn_operator op)
{
  grn_rc rc = GRN_SUCCESS;
  grn_obj *target = NULL;
  grn_obj *obj;
  grn_obj *query;
  uint32_t max_distance = 1;
  uint32_t prefix_length = 0;
  uint32_t prefix_match_size = 0;
  uint32_t max_expansion = 0;
  int flags = 0;
  grn_bool use_sequential_search = GRN_FALSE;

  if ((nargs - 1) < 2) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "fuzzy_search(): wrong number of arguments (%d ...)",
                     nargs - 1);
    rc = ctx->rc;
    goto exit;
  }
  obj = args[1];
  query = args[2];

  if (nargs == 4) {
    grn_obj *options = args[3];
    grn_hash_cursor *cursor;
    void *key;
    grn_obj *value;
    int key_size;

    if (options->header.type != GRN_TABLE_HASH_KEY) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "fuzzy_search(): "
                       "3rd argument must be object literal: <%.*s>",
                       (int)GRN_TEXT_LEN(options),
                       GRN_TEXT_VALUE(options));
      rc = ctx->rc;
      goto exit;
    }

    cursor = grn_hash_cursor_open(ctx, (grn_hash *)options,
                                  NULL, 0, NULL, 0,
                                  0, -1, 0);
    if (!cursor) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "fuzzy_search(): couldn't open cursor");
      rc = ctx->rc;
      goto exit;
    }
    while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
      grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size,
                                    (void **)&value);

      if (key_size == 12 && !memcmp(key, "max_distance", 12)) {
        max_distance = GRN_UINT32_VALUE(value);
      } else if (key_size == 13 && !memcmp(key, "prefix_length", 13)) {
        prefix_length = GRN_UINT32_VALUE(value);
      } else if (key_size == 13 && !memcmp(key, "max_expansion", 13)) {
        max_expansion = GRN_UINT32_VALUE(value);
      } else if (key_size == 18 && !memcmp(key, "with_transposition", 18)) {
        if (GRN_BOOL_VALUE(value)) {
          flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
        }
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "invalid option name: <%.*s>",
                         key_size, (char *)key);
        grn_hash_cursor_close(ctx, cursor);
        rc = ctx->rc;
        goto exit;
      }
    }
    grn_hash_cursor_close(ctx, cursor);
  }

  if (index) {
    target = index;
  } else {
    if (obj->header.type == GRN_COLUMN_INDEX) {
      target = obj;
    } else {
      grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL);
    }
  }

  if (target) {
    grn_obj *lexicon;
    use_sequential_search = GRN_TRUE;
    lexicon = grn_ctx_at(ctx, target->header.domain);
    if (lexicon) {
      if (lexicon->header.type == GRN_TABLE_PAT_KEY) {
        use_sequential_search = GRN_FALSE;
      }
      grn_obj_unlink(ctx, lexicon);
    }
  } else {
    if (grn_obj_is_key_accessor(ctx, obj) &&
        table->header.type == GRN_TABLE_PAT_KEY) {
      target = table;
    } else {
      use_sequential_search = GRN_TRUE;
    }
  }

  if (prefix_length) {
    const char *s = GRN_TEXT_VALUE(query);
    const char *e = GRN_BULK_CURR(query);
    const char *p;
    unsigned int cl = 0;
    unsigned int length = 0;
    for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) {
      length++;
      if (length > prefix_length) {
        break;
      }
    }
    prefix_match_size = p - s;
  }

  if (use_sequential_search) {
    rc = sequential_fuzzy_search(ctx, table, obj, query,
                                 max_distance, prefix_match_size,
                                 max_expansion, flags, res, op);
    goto exit;
  }

  if (!target) {
    grn_obj inspected;
    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, target);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "fuzzy_search(): "
                     "column must be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>",
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    rc = ctx->rc;
    GRN_OBJ_FIN(ctx, &inspected);
  } else {
    grn_search_optarg options = {0};
    options.mode = GRN_OP_FUZZY;
    options.fuzzy.prefix_match_size = prefix_match_size;
    options.fuzzy.max_distance = max_distance;
    options.fuzzy.max_expansion = max_expansion;
    options.fuzzy.flags = flags;
    grn_obj_search(ctx, target, query, res, op, &options);
  }

exit :
  return rc;
}
Code example #23
File: proc_highlight.c Project: ohkubo/groonga
static grn_obj *
func_highlight(grn_ctx *ctx, int nargs, grn_obj **args,
               grn_user_data *user_data)
{
  grn_obj *highlighted = NULL;

#define N_REQUIRED_ARGS 1
  if (nargs > N_REQUIRED_ARGS) {
    grn_obj *string = args[0];
    grn_bool use_html_escape = GRN_FALSE;
    grn_obj *keywords;
    const char *normalizer_name = "NormalizerAuto";
    unsigned int normalizer_name_length = 14;
    const char *default_open_tag = NULL;
    unsigned int default_open_tag_length = 0;
    const char *default_close_tag = NULL;
    unsigned int default_close_tag_length = 0;
    grn_obj *end_arg = args[nargs - 1];
    int n_args_without_option = nargs;

    if (end_arg->header.type == GRN_TABLE_HASH_KEY) {
      grn_obj *options = end_arg;
      grn_hash_cursor *cursor;
      void *key;
      grn_obj *value;
      int key_size;

      n_args_without_option--;
      cursor = grn_hash_cursor_open(ctx, (grn_hash *)options,
                                    NULL, 0, NULL, 0,
                                    0, -1, 0);
      if (!cursor) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "highlight(): couldn't open cursor");
        goto exit;
      }
      while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
        grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size,
                                      (void **)&value);
        if (key_size == 10 && !memcmp(key, "normalizer", 10)) {
          normalizer_name = GRN_TEXT_VALUE(value);
          normalizer_name_length = GRN_TEXT_LEN(value);
        } else if (key_size == 11 && !memcmp(key, "html_escape", 11)) {
          if (GRN_BOOL_VALUE(value)) {
            use_html_escape = GRN_TRUE;
          }
        } else if (key_size == 16 && !memcmp(key, "default_open_tag", 16)) {
          default_open_tag = GRN_TEXT_VALUE(value);
          default_open_tag_length = GRN_TEXT_LEN(value);
        } else if (key_size == 17 && !memcmp(key, "default_close_tag", 17)) {
          default_close_tag = GRN_TEXT_VALUE(value);
          default_close_tag_length = GRN_TEXT_LEN(value);
        } else {
          GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid option name: <%.*s>",
                           key_size, (char *)key);
          grn_hash_cursor_close(ctx, cursor);
          goto exit;
        }
      }
      grn_hash_cursor_close(ctx, cursor);
    }

    keywords =
      func_highlight_create_keywords_table(ctx, user_data,
                                           normalizer_name,
                                           normalizer_name_length);

    if (keywords) {
      grn_obj **keyword_args = args + N_REQUIRED_ARGS;
      unsigned int n_keyword_args = n_args_without_option - N_REQUIRED_ARGS;
      if (default_open_tag_length == 0 && default_close_tag_length == 0) {
        highlighted = highlight_keyword_sets(ctx, user_data,
                                             keyword_args, n_keyword_args,
                                             string, keywords, use_html_escape);
      } else {
        unsigned int i;
        for (i = 0; i < n_keyword_args; i++) {
          grn_table_add(ctx, keywords,
                        GRN_TEXT_VALUE(keyword_args[i]),
                        GRN_TEXT_LEN(keyword_args[i]),
                        NULL);
        }
        highlighted = highlight_keywords(ctx, user_data,
                                         string, keywords, use_html_escape,
                                         default_open_tag, default_open_tag_length,
                                         default_close_tag, default_close_tag_length);
      }
    }
  }
#undef N_REQUIRED_ARGS

exit :
  if (!highlighted) {
    highlighted = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_VOID, 0);
  }

  return highlighted;
}
Code example #24
File: tokenizer.c Project: AkioKanno/groonga
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
Code example #25
File: proc_tokenize.c Project: XLPE/groonga
static grn_obj *
command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *table_name;
  grn_obj *string;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *index_column_name;

  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  index_column_name = grn_plugin_proc_get_var(ctx, user_data, "index_column", -1);

  if (GRN_TEXT_LEN(table_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;
    grn_obj *index_column = NULL;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));

    if (!lexicon) {
      return NULL;
    }

#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    if (GRN_TEXT_LEN(index_column_name) > 0) {
      index_column = grn_obj_column(ctx, lexicon,
                                    GRN_TEXT_VALUE(index_column_name),
                                    GRN_TEXT_LEN(index_column_name));
      if (!index_column) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] nonexistent index column: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
      if (index_column->header.type != GRN_COLUMN_INDEX) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else if (MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

exit:
    grn_obj_unlink(ctx, lexicon);
    if (index_column) {
      grn_obj_unlink(ctx, index_column);
    }
  }

  return NULL;
}
Code example #26
static grn_obj *
yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
             unsigned short ngram_unit, grn_bool ignore_blank,
             grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit,
             grn_bool skip_overlap, unsigned short use_vgram)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags =
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER |
    GRN_STRING_REMOVE_BLANK;

  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_yangram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer)))) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][yangram] "
                     "memory allocation to grn_yangram_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;
  tokenizer->skip_overlap = skip_overlap;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->split_symbol = split_symbol;
  tokenizer->split_alpha = split_alpha;
  tokenizer->split_digit = split_digit;
  tokenizer->use_vgram = use_vgram;
  if (tokenizer->use_vgram > 0) {
    const char *vgram_word_table_name_env;
    vgram_word_table_name_env = getenv("GRN_VGRAM_WORD_TABLE_NAME");

    if (vgram_word_table_name_env) {
      tokenizer->vgram_table = grn_ctx_get(ctx,
                                           vgram_word_table_name_env,
                                           strlen(vgram_word_table_name_env));
    } else {
      tokenizer->vgram_table = grn_ctx_get(ctx,
                                           VGRAM_WORD_TABLE_NAME,
                                           strlen(VGRAM_WORD_TABLE_NAME));
    }
    if (!tokenizer->vgram_table) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[tokenizer][yangram] "
                       "couldn't open a vgram table");
      tokenizer->vgram_table = NULL;
      return NULL;
    }
  } else {
    tokenizer->vgram_table = NULL;
  }
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  {
    const char *phrase_table_name_env;
    phrase_table_name_env = getenv("GRN_KNOWN_PHRASE_TABLE_NAME");

    if (phrase_table_name_env) {
      tokenizer->phrase_table = grn_ctx_get(ctx,
                                            phrase_table_name_env,
                                            strlen(phrase_table_name_env));
    } else {
      tokenizer->phrase_table = grn_ctx_get(ctx,
                                            KNOWN_PHRASE_TABLE_NAME,
                                            strlen(KNOWN_PHRASE_TABLE_NAME));
    }
    if (tokenizer->phrase_table) {
      if (!(tokenizer->hits =
            GRN_PLUGIN_MALLOC(ctx, sizeof(grn_pat_scan_hit) * MAX_N_HITS))) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[tokenizer][yangram] "
                         "memory allocation to grn_pat_scan_hit failed");
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      } else {
        tokenizer->scan_rest = normalized;
        tokenizer->nhits = 0;
        tokenizer->current_hit = 0;
      }
    } else {
      tokenizer->phrase_table = NULL;
    }
  }

  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->rest_length = tokenizer->end - tokenizer->next;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  tokenizer->pushed_token_tail = NULL;
  tokenizer->ctypes_next = 0;

  return NULL;
}
Code example #27
File: proc_tokenize.c Project: XLPE/groonga
static grn_obj *
command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *tokenizer_name;
  grn_obj *string;
  grn_obj *normalizer_name;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *token_filter_names;

  tokenizer_name = grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  token_filter_names = grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);

  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = create_lexicon_for_tokenize(ctx,
                                          tokenizer_name,
                                          normalizer_name,
                                          token_filter_names);
    if (!lexicon) {
      return NULL;
    }
#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else if (MODE_NAME_EQUAL("GET")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        GRN_BULK_REWIND(&tokens);
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

    grn_obj_unlink(ctx, lexicon);
  }

  return NULL;
}
Code example #28
File: proc_snippet.c Project: cafedomancer/groonga
/* TODO: support caching for the same parameter. */
static grn_obj *
func_snippet(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *snippets = NULL;

#define N_REQUIRED_ARGS 1
#define KEYWORD_SET_SIZE 3
  if (nargs > N_REQUIRED_ARGS) {
    grn_obj *text = args[0];
    grn_obj *end_arg = args[nargs - 1];
    grn_obj *snip = NULL;
    unsigned int width = 200;
    unsigned int max_n_results = 3;
    grn_snip_mapping *mapping = NULL;
    int flags = GRN_SNIP_SKIP_LEADING_SPACES;
    const char *prefix = NULL;
    int prefix_length = 0;
    const char *suffix = NULL;
    int suffix_length = 0;
    const char *normalizer_name = NULL;
    int normalizer_name_length = 0;
    const char *default_open_tag = NULL;
    int default_open_tag_length = 0;
    const char *default_close_tag = NULL;
    int default_close_tag_length = 0;
    int n_args_without_option = nargs;

    if (end_arg->header.type == GRN_TABLE_HASH_KEY) {
      grn_obj *options = end_arg;
      grn_hash_cursor *cursor;
      void *key;
      int key_size;
      grn_obj *value;

      n_args_without_option--;
      cursor = grn_hash_cursor_open(ctx, (grn_hash *)options,
                                    NULL, 0, NULL, 0,
                                    0, -1, 0);
      if (!cursor) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "snippet(): couldn't open cursor");
        goto exit;
      }
      while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
        grn_hash_cursor_get_key_value(ctx, cursor,
                                      &key, &key_size,
                                      (void **)&value);
        if (key_size == 5 && !memcmp(key, "width", 5)) {
          width = GRN_UINT32_VALUE(value);
        } else if (key_size == 13 && !memcmp(key, "max_n_results", 13)) {
          max_n_results = GRN_UINT32_VALUE(value);
        } else if (key_size == 19 && !memcmp(key, "skip_leading_spaces", 19)) {
          if (GRN_BOOL_VALUE(value) == GRN_FALSE) {
            flags &= ~GRN_SNIP_SKIP_LEADING_SPACES;
          }
        } else if (key_size == 11 && !memcmp(key, "html_escape", 11)) {
          if (GRN_BOOL_VALUE(value)) {
            mapping = GRN_SNIP_MAPPING_HTML_ESCAPE;
          }
        } else if (key_size == 6 && !memcmp(key, "prefix", 6)) {
          prefix = GRN_TEXT_VALUE(value);
          prefix_length = GRN_TEXT_LEN(value);
        } else if (key_size == 6 && !memcmp(key, "suffix", 6)) {
          suffix = GRN_TEXT_VALUE(value);
          suffix_length = GRN_TEXT_LEN(value);
        } else if (key_size == 10 && !memcmp(key, "normalizer", 10)) {
          normalizer_name = GRN_TEXT_VALUE(value);
          normalizer_name_length = GRN_TEXT_LEN(value);
        } else if (key_size == 16 && !memcmp(key, "default_open_tag", 16)) {
          default_open_tag = GRN_TEXT_VALUE(value);
          default_open_tag_length = GRN_TEXT_LEN(value);
        } else if (key_size == 17 && !memcmp(key, "default_close_tag", 17)) {
          default_close_tag = GRN_TEXT_VALUE(value);
          default_close_tag_length = GRN_TEXT_LEN(value);
        } else {
          GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                           "invalid option name: <%.*s>",
                           key_size, (char *)key);
          grn_hash_cursor_close(ctx, cursor);
          goto exit;
        }
      }
      grn_hash_cursor_close(ctx, cursor);
    }

    snip = grn_snip_open(ctx, flags, width, max_n_results,
                         default_open_tag, default_open_tag_length,
                         default_close_tag, default_close_tag_length, mapping);
    if (snip) {
      grn_rc rc;
      unsigned int i;
      if (!normalizer_name) {
        grn_snip_set_normalizer(ctx, snip, GRN_NORMALIZER_AUTO);
      } else if (normalizer_name_length > 0) {
        grn_obj *normalizer;
        normalizer = grn_ctx_get(ctx, normalizer_name, normalizer_name_length);
        if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
          grn_obj inspected;
          GRN_TEXT_INIT(&inspected, 0);
          grn_inspect(ctx, &inspected, normalizer);
          GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                           "snippet(): not normalizer: <%.*s>",
                           (int)GRN_TEXT_LEN(&inspected),
                           GRN_TEXT_VALUE(&inspected));
          GRN_OBJ_FIN(ctx, &inspected);
          grn_obj_unlink(ctx, normalizer);
          goto exit;
        }
        grn_snip_set_normalizer(ctx, snip, normalizer);
        grn_obj_unlink(ctx, normalizer);
      }
      if (default_open_tag_length == 0 && default_close_tag_length == 0) {
        unsigned int n_keyword_sets =
          (n_args_without_option - N_REQUIRED_ARGS) / KEYWORD_SET_SIZE;
        grn_obj **keyword_set_args = args + N_REQUIRED_ARGS;
        for (i = 0; i < n_keyword_sets; i++) {
          rc = grn_snip_add_cond(ctx, snip,
                                 GRN_TEXT_VALUE(keyword_set_args[i * KEYWORD_SET_SIZE]),
                                 GRN_TEXT_LEN(keyword_set_args[i * KEYWORD_SET_SIZE]),
                                 GRN_TEXT_VALUE(keyword_set_args[i * KEYWORD_SET_SIZE + 1]),
                                 GRN_TEXT_LEN(keyword_set_args[i * KEYWORD_SET_SIZE + 1]),
                                 GRN_TEXT_VALUE(keyword_set_args[i * KEYWORD_SET_SIZE + 2]),
                                 GRN_TEXT_LEN(keyword_set_args[i * KEYWORD_SET_SIZE + 2]));
        }
      } else {
        unsigned int n_keywords = n_args_without_option - N_REQUIRED_ARGS;
        grn_obj **keyword_args = args + N_REQUIRED_ARGS;
        for (i = 0; i < n_keywords; i++) {
          rc = grn_snip_add_cond(ctx, snip,
                                 GRN_TEXT_VALUE(keyword_args[i]),
                                 GRN_TEXT_LEN(keyword_args[i]),
                                 NULL, 0,
                                 NULL, 0);
        }
      }
      snippets = snippet_exec(ctx, snip, text, user_data,
                              prefix, prefix_length,
                              suffix, suffix_length);
    }
  }
#undef KEYWORD_SET_SIZE
#undef N_REQUIRED_ARGS

exit :
  if (!snippets) {
    snippets = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_VOID, 0);
  }

  return snippets;
}
Code example #29
File: time.c Project: digideskio/groonga
static grn_obj *
func_time_classify_raw(grn_ctx *ctx,
                       int n_args,
                       grn_obj **args,
                       grn_user_data *user_data,
                       const char *function_name,
                       grn_time_classify_unit unit)
{
  grn_obj *time;
  uint32_t interval_raw = 1;
  grn_obj *classed_time;
  grn_bool accept_interval = GRN_TRUE;

  switch (unit) {
  case GRN_TIME_CLASSIFY_UNIT_SECOND :
  case GRN_TIME_CLASSIFY_UNIT_MINUTE :
  case GRN_TIME_CLASSIFY_UNIT_HOUR :
    accept_interval = GRN_TRUE;
    break;
  case GRN_TIME_CLASSIFY_UNIT_DAY :
  case GRN_TIME_CLASSIFY_UNIT_WEEK :
    accept_interval = GRN_FALSE;
    break;
  case GRN_TIME_CLASSIFY_UNIT_MONTH :
  case GRN_TIME_CLASSIFY_UNIT_YEAR :
    accept_interval = GRN_TRUE;
    break;
  }

  if (accept_interval) {
    if (!(n_args == 1 || n_args == 2)) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "%s(): "
                       "wrong number of arguments (%d for 1..2)",
                       function_name,
                       n_args);
      return NULL;
    }
  } else {
    if (n_args != 1) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "%s(): "
                       "wrong number of arguments (%d for 1)",
                       function_name,
                       n_args);
      return NULL;
    }
  }

  time = args[0];
  if (!(time->header.type == GRN_BULK &&
        time->header.domain == GRN_DB_TIME)) {
    grn_obj inspected;

    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, time);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "%s(): "
                     "the first argument must be a time: "
                     "<%.*s>",
                     function_name,
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    GRN_OBJ_FIN(ctx, &inspected);
    return NULL;
  }

  if (n_args == 2) {
    grn_obj *interval;
    grn_obj casted_interval;

    interval = args[1];
    if (!(interval->header.type == GRN_BULK &&
          grn_type_id_is_number_family(ctx, interval->header.domain))) {
      grn_obj inspected;

      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, interval);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "%s(): "
                       "the second argument must be a number: "
                       "<%.*s>",
                       function_name,
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      return NULL;
    }

    GRN_VALUE_FIX_SIZE_INIT(&casted_interval, 0, GRN_DB_UINT32);
    grn_obj_cast(ctx, interval, &casted_interval, GRN_FALSE);
    interval_raw = GRN_UINT32_VALUE(&casted_interval);
    GRN_OBJ_FIN(ctx, &casted_interval);
  }

  {
    int64_t time_raw;
    struct tm tm;
    int64_t classed_time_raw;

    time_raw = GRN_TIME_VALUE(time);
    if (!grn_time_to_tm(ctx, time_raw, &tm)) {
      return NULL;
    }

    switch (unit) {
    case GRN_TIME_CLASSIFY_UNIT_SECOND :
      tm.tm_sec = (tm.tm_sec / interval_raw) * interval_raw;
      break;
    case GRN_TIME_CLASSIFY_UNIT_MINUTE :
      tm.tm_min = (tm.tm_min / interval_raw) * interval_raw;
      tm.tm_sec = 0;
      break;
    case GRN_TIME_CLASSIFY_UNIT_HOUR :
      tm.tm_hour = (tm.tm_hour / interval_raw) * interval_raw;
      tm.tm_min = 0;
      tm.tm_sec = 0;
      break;
    case GRN_TIME_CLASSIFY_UNIT_DAY :
      tm.tm_hour = 0;
      tm.tm_min = 0;
      tm.tm_sec = 0;
      break;
    case GRN_TIME_CLASSIFY_UNIT_WEEK :
      if ((tm.tm_mday - tm.tm_wday) >= 0) {
        tm.tm_mday -= tm.tm_wday;
      } else {
        int n_underflowed_mday = -(tm.tm_mday - tm.tm_wday);
        int mday;
        int max_mday = 31;

        if (tm.tm_mon == 0) {
          tm.tm_year--;
          tm.tm_mon = 11;
        } else {
          tm.tm_mon--;
        }

        for (mday = max_mday; mday > n_underflowed_mday; mday--) {
          int64_t unused;
          tm.tm_mday = mday;
          if (grn_time_from_tm(ctx, &unused, &tm)) {
            break;
          }
        }
        tm.tm_mday -= n_underflowed_mday;
      }
      tm.tm_hour = 0;
      tm.tm_min = 0;
      tm.tm_sec = 0;
      break;
    case GRN_TIME_CLASSIFY_UNIT_MONTH :
      tm.tm_mon = (tm.tm_mon / interval_raw) * interval_raw;
      tm.tm_mday = 1;
      tm.tm_hour = 0;
      tm.tm_min = 0;
      tm.tm_sec = 0;
      break;
    case GRN_TIME_CLASSIFY_UNIT_YEAR :
      tm.tm_year = (((1900 + tm.tm_year) / interval_raw) * interval_raw) - 1900;
      tm.tm_mon = 0;
      tm.tm_mday = 1;
      tm.tm_hour = 0;
      tm.tm_min = 0;
      tm.tm_sec = 0;
      break;
    }

    if (!grn_time_from_tm(ctx, &classed_time_raw, &tm)) {
      return NULL;
    }

    classed_time = grn_plugin_proc_alloc(ctx,
                                         user_data,
                                         time->header.domain,
                                         0);
    if (!classed_time) {
      return NULL;
    }
    GRN_TIME_SET(ctx, classed_time, classed_time_raw);

    return classed_time;
  }
}
Code example #30
File: mecab.c Project: asmlib/mariadb-server
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
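Finally, mecab_init() is hooked up through grn_tokenizer_register() (code example #6). A minimal sketch, assuming mecab_next and mecab_fin callbacks that are not shown in this listing:

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  /* grn_tokenizer_register() here takes an unsigned length,
     so pass it explicitly instead of -1 */
  return grn_tokenizer_register(ctx, "TokenMecab", strlen("TokenMecab"),
                                mecab_init, mecab_next, mecab_fin);
}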