/*
 * Re-creates query->normalized_query when a normalization is pending.
 *
 * Nothing happens unless query->need_normalize is set. When it is set, the
 * flag is cleared, any existing normalized string is closed, and a new one
 * is opened from query->ptr / query->length with query->lexicon,
 * query->normalize_flags and query->encoding.
 *
 * On success query->need_delimiter_check is set to GRN_TRUE.
 * On failure query->normalized_query is NULL,
 * query->have_tokenized_delimiter is reset to GRN_FALSE and a
 * GRN_TOKENIZER_ERROR is reported through GRN_PLUGIN_ERROR.
 */
static void
grn_tokenizer_query_ensure_normalized(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query->need_normalize) {
    grn_obj *reopened;

    /* Clear the request first: it is consumed even if reopening fails. */
    query->need_normalize = GRN_FALSE;

    if (query->normalized_query != NULL) {
      grn_obj_close(ctx, query->normalized_query);
    }

    reopened = grn_string_open_(ctx,
                                query->ptr,
                                query->length,
                                query->lexicon,
                                query->normalize_flags,
                                query->encoding);
    query->normalized_query = reopened;

    if (reopened != NULL) {
      query->need_delimiter_check = GRN_TRUE;
    } else {
      query->have_tokenized_delimiter = GRN_FALSE;
      GRN_PLUGIN_ERROR(ctx,
                       GRN_TOKENIZER_ERROR,
                       "[tokenizer][normalize] failed to open normalized string");
    }
  }
}
/*
 * Builds a grn_tokenizer_query from the values on the ctx stack and the
 * table passed in args[0].
 *
 * Three objects are popped from the ctx stack, in this order: flags, the
 * query string and the tokenize mode. A missing flags object yields 0 and
 * a missing tokenize mode yields GRN_TOKENIZE_ADD.
 *
 * The flags, encoding and normalizer of args[0] are fetched with
 * grn_table_get_info(). If the table flags contain GRN_OBJ_KEY_NORMALIZE,
 * GRN_NORMALIZER_AUTO is used as the normalizer. The query text is then
 * normalized with grn_string_open_() and also copied into a newly
 * allocated, NUL-terminated buffer stored in query->query_buf (query->ptr
 * points at the same buffer).
 *
 * query->have_tokenized_delimiter is computed from the normalized text
 * only when GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER is set in the
 * flags; otherwise it is GRN_FALSE.
 *
 * @param ctx             context whose stack supplies the arguments
 * @param num_args        number of entries in args; must be >= 1
 * @param args            args[0] is the table to read settings from
 * @param normalize_flags flags forwarded to grn_string_open_()
 * @return the new query, or NULL on any failure. Everything allocated
 *         here is released before NULL is returned.
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* The pops happen before any validation, so the stack entries are
     consumed on the error paths as well. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
  grn_tokenizer_query *query;
  grn_obj *table;
  grn_obj *normalizer = NULL;
  grn_obj *normalized_query;
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  unsigned int query_length;
  char *query_buf;

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  query = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
  if (query == NULL) {
    return NULL;
  }
  query->normalized_query = NULL;
  query->query_buf = NULL;
  if (flags) {
    query->flags = GRN_UINT32_VALUE(flags);
  } else {
    query->flags = 0;
  }
  if (tokenize_mode) {
    query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
  } else {
    query->tokenize_mode = GRN_TOKENIZE_ADD;
  }
  query->token_mode = query->tokenize_mode;

  table = args[0];
  query_length = GRN_TEXT_LEN(query_str);

  /* One extra byte for the terminating NUL appended after the copy. */
  query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
  if (query_buf == NULL) {
    GRN_PLUGIN_FREE(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer] failed to duplicate query");
    return NULL;
  }

  /* table_flags and table_encoding are only meaningful when the lookup
     succeeds; bail out instead of reading them uninitialized. */
  if (grn_table_get_info(ctx, table, &table_flags, &table_encoding,
                         NULL, &normalizer, NULL)) {
    GRN_PLUGIN_FREE(ctx, query_buf);
    GRN_PLUGIN_FREE(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenizer] failed to get table information");
    return NULL;
  }

  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
    normalizer = GRN_NORMALIZER_AUTO;
  }

  normalized_query = grn_string_open_(ctx,
                                      GRN_TEXT_VALUE(query_str),
                                      query_length,
                                      normalizer,
                                      normalize_flags,
                                      table_encoding);
  if (!normalized_query) {
    GRN_PLUGIN_FREE(ctx, query_buf);
    GRN_PLUGIN_FREE(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer] failed to open normalized string");
    return NULL;
  }
  query->normalized_query = normalized_query;

  /* Keep a private, NUL-terminated copy of the raw query text. */
  grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
  query_buf[query_length] = '\0';
  query->query_buf = query_buf;
  query->ptr = query_buf;
  query->length = query_length;
  query->encoding = table_encoding;

  if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
    const char *normalized_string;
    unsigned int normalized_string_length;

    grn_string_get_normalized(ctx,
                              query->normalized_query,
                              &normalized_string,
                              &normalized_string_length,
                              NULL);
    query->have_tokenized_delimiter =
      grn_tokenizer_have_tokenized_delimiter(ctx,
                                             normalized_string,
                                             normalized_string_length,
                                             query->encoding);
  } else {
    query->have_tokenized_delimiter = GRN_FALSE;
  }

  return query;
}
/*
 * Allocates and initializes a grn_token for tokenizing `str` against
 * `table`.
 *
 * The table's encoding, tokenizer and normalizer are read with
 * grn_table_get_info(). The input text is referenced (token->orig = str),
 * not copied.
 *
 * - If the table has a tokenizer, `str` and `flags` are wrapped in
 *   stack-local objects, pushed onto the ctx stack, and the tokenizer's
 *   PROC_INIT function is called with `table` as its single argument.
 * - Otherwise the text is normalized with grn_string_open_() and
 *   token->curr / token->curr_size are set to the normalized text. If that
 *   fails, a GRN_TOKENIZER_ERROR is raised.
 *
 * @return the new token, or NULL when the table info lookup fails, the
 *         allocation fails, or ctx->rc is non-zero after initialization
 *         (in which case the token is closed with grn_token_close()).
 */
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode, unsigned int flags)
{
  grn_obj_flags table_flags;
  grn_encoding encoding;
  grn_obj *tokenizer;
  grn_obj *normalizer;
  grn_token *token;

  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
                         &normalizer)) {
    return NULL;
  }

  token = GRN_MALLOC(sizeof(grn_token));
  if (token == NULL) {
    return NULL;
  }

  /* Where the token comes from. */
  token->table = table;
  token->tokenizer = tokenizer;
  token->encoding = encoding;
  token->mode = mode;

  /* The text being tokenized (borrowed from the caller). */
  token->orig = str;
  token->orig_blen = str_len;

  /* Cursor state before the first token is produced. */
  token->nstr = NULL;
  token->curr = NULL;
  token->curr_size = 0;
  token->pos = -1;
  token->status = GRN_TOKEN_DOING;
  token->force_prefix = 0;

  if (!tokenizer) {
    /* No tokenizer: work directly on the normalized text. */
    token->nstr = grn_string_open_(ctx, str, str_len, normalizer, 0,
                                   token->encoding);
    if (!token->nstr) {
      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
    } else {
      const char *normalized_text;

      grn_string_get_normalized(ctx, token->nstr,
                                &normalized_text, &(token->curr_size), NULL);
      token->curr = (const unsigned char *)normalized_text;
    }
  } else {
    grn_proc *tokenizer_proc = (grn_proc *)tokenizer;
    grn_obj text_arg;
    grn_obj flags_arg;

    /* Wrap the arguments without copying the text. */
    GRN_TEXT_INIT(&text_arg, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&text_arg, str, str_len);
    GRN_UINT32_INIT(&flags_arg, 0);
    GRN_UINT32_SET(ctx, &flags_arg, flags);

    token->pctx.proc = tokenizer_proc;
    token->pctx.caller = NULL;
    token->pctx.hooks = NULL;
    token->pctx.currh = NULL;
    token->pctx.user_data.ptr = NULL;
    token->pctx.phase = PROC_INIT;

    /* Push order matters: the text first, then the flags. */
    grn_ctx_push(ctx, &text_arg);
    grn_ctx_push(ctx, &flags_arg);
    tokenizer_proc->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);

    grn_obj_close(ctx, &flags_arg);
    grn_obj_close(ctx, &text_arg);
  }

  /* Any error recorded in the context invalidates the new token. */
  if (ctx->rc) {
    grn_token_close(ctx, token);
    return NULL;
  }
  return token;
}
/*
 * Convenience wrapper around grn_string_open_() that supplies
 * ctx->encoding as the encoding argument.
 *
 * All other arguments are forwarded unchanged, and the result of
 * grn_string_open_() is returned as-is.
 */
grn_obj *
grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
                grn_obj *normalizer, int flags)
{
  grn_obj *string;

  string = grn_string_open_(ctx, str, str_len, normalizer, flags,
                            ctx->encoding);
  return string;
}