Example #1
File: tokenizer.c  Project: groonga/groonga
static void
grn_tokenizer_query_ensure_normalized(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (!query->need_normalize) {
    return;
  }

  query->need_normalize = GRN_FALSE;

  if (query->normalized_query) {
    grn_obj_close(ctx, query->normalized_query);
  }
  query->normalized_query = grn_string_open_(ctx,
                                             query->ptr,
                                             query->length,
                                             query->lexicon,
                                             query->normalize_flags,
                                             query->encoding);
  if (!query->normalized_query) {
    query->have_tokenized_delimiter = GRN_FALSE;
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][normalize] failed to open normalized string");
    return;
  }

  query->need_delimiter_check = GRN_TRUE;
}
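Example #1 implements lazy normalization: need_normalize acts as a dirty flag, so the (possibly expensive) grn_string_open_() call runs only when the query actually changed since the last normalization. A plausible consumer of this helper is an accessor that guarantees a fresh normalized string before returning it; the sketch below follows that pattern (the function name is illustrative, not taken from this listing):

static grn_obj *
my_query_get_normalized_string(grn_ctx *ctx, grn_tokenizer_query *query)
{
  /* Re-normalize only if the raw query changed since the last call. */
  grn_tokenizer_query_ensure_normalized(ctx, query);
  return query->normalized_query;  /* NULL if normalization failed */
}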
Example #2
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
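grn_tokenizer_query_open() is what a tokenizer plugin's init callback calls first: it pops the three stack arguments (flags, query string, tokenize mode) that the tokenizer framework pushed, duplicates and normalizes the query, and hands back a ready-to-use grn_tokenizer_query. A minimal sketch of such an init callback, assuming the <groonga/tokenizer.h> plugin header and a hypothetical my_tokenizer state struct:

#include <groonga/tokenizer.h>

typedef struct {
  grn_tokenizer_query *query;   /* hypothetical plugin state */
} my_tokenizer;

static grn_obj *
my_init(grn_ctx *ctx, int num_args, grn_obj **args, grn_user_data *user_data)
{
  my_tokenizer *tokenizer;
  grn_tokenizer_query *query =
    grn_tokenizer_query_open(ctx, num_args, args, 0);
  if (!query) {
    return NULL;                /* error already reported via ctx */
  }
  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(my_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  tokenizer->query = query;
  user_data->ptr = tokenizer;   /* handed back to next/fin callbacks */
  return NULL;
}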
Example #3
File: token.c  Project: henbow/groonga
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode, unsigned int flags)
{
  grn_token *token;
  grn_encoding encoding;
  grn_obj *tokenizer;
  grn_obj *normalizer;
  grn_obj_flags table_flags;
  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
                         &normalizer)) {
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
  token->table = table;
  token->mode = mode;
  token->encoding = encoding;
  token->tokenizer = tokenizer;
  token->orig = str;
  token->orig_blen = str_len;
  token->curr = NULL;
  token->nstr = NULL;
  token->curr_size = 0;
  token->pos = -1;
  token->status = GRN_TOKEN_DOING;
  token->force_prefix = 0;
  if (tokenizer) {
    grn_obj str_, flags_;
    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&str_, str, str_len);
    GRN_UINT32_INIT(&flags_, 0);
    GRN_UINT32_SET(ctx, &flags_, flags);
    token->pctx.caller = NULL;
    token->pctx.user_data.ptr = NULL;
    token->pctx.proc = (grn_proc *)tokenizer;
    token->pctx.hooks = NULL;
    token->pctx.currh = NULL;
    token->pctx.phase = PROC_INIT;
    grn_ctx_push(ctx, &str_);
    grn_ctx_push(ctx, &flags_);
    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
    grn_obj_close(ctx, &flags_);
    grn_obj_close(ctx, &str_);
  } else {
    int nflags = 0;
    token->nstr = grn_string_open_(ctx, str, str_len,
                                   normalizer, nflags, token->encoding);
    if (token->nstr) {
      const char *normalized;
      grn_string_get_normalized(ctx, token->nstr,
                                &normalized, &(token->curr_size), NULL);
      token->curr = (const unsigned char *)normalized;
    } else {
      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
    }
  }
  if (ctx->rc) {
    grn_token_close(ctx, token);
    token = NULL;
  }
  return token;
}
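grn_token_open() here is the older, in-tree token API (later renamed grn_token_cursor); its companions grn_token_next() and grn_token_close() live in the same token.c. As a rough usage sketch, assuming the in-tree "token.h" header and that GRN_TOKEN_GET and GRN_TOKEN_DONE carry their usual meanings in this version:

#include <string.h>
#include "token.h"  /* internal header from the same source tree */

/* Sketch: tokenize str against lexicon and walk the resulting tokens. */
static void
walk_tokens(grn_ctx *ctx, grn_obj *lexicon, const char *str)
{
  grn_token *token = grn_token_open(ctx, lexicon, str, strlen(str),
                                    GRN_TOKEN_GET, 0);
  if (!token) { return; }
  while (token->status != GRN_TOKEN_DONE && !ctx->rc) {
    grn_id tid = grn_token_next(ctx, token);
    /* token->curr / token->curr_size hold the current token's bytes;
       tid is its id in the lexicon (GRN_ID_NIL if it is not there). */
    (void)tid;
  }
  grn_token_close(ctx, token);
}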
Example #4
grn_obj *
grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
                grn_obj *normalizer, int flags)
{
  return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding);
}
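grn_string_open() is the public wrapper over the grn_string_open_() variant used throughout the examples above: it simply defaults the encoding to ctx->encoding instead of taking it as a parameter. A minimal, self-contained usage sketch (GRN_NORMALIZER_AUTO selects the default NormalizerAuto, which among other things downcases ASCII):

#include <stdio.h>
#include <groonga.h>

int
main(void)
{
  grn_ctx ctx;
  grn_init();
  grn_ctx_init(&ctx, 0);

  /* Normalize a 7-byte literal with the auto-selected normalizer. */
  grn_obj *string = grn_string_open(&ctx, "Groonga", 7,
                                    GRN_NORMALIZER_AUTO, 0);
  if (string) {
    const char *normalized;
    unsigned int length_in_bytes;
    grn_string_get_normalized(&ctx, string,
                              &normalized, &length_in_bytes, NULL);
    printf("%.*s\n", (int)length_in_bytes, normalized);  /* "groonga" */
    grn_obj_close(&ctx, string);
  }

  grn_ctx_fin(&ctx);
  grn_fin();
  return 0;
}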