/*
 * Lazily compute query->have_tokenized_delimiter.
 *
 * The check runs at most once per query: need_delimiter_check acts as a
 * dirty flag and is cleared on the first call.  The delimiter scan is
 * performed on the normalized form of the query, so normalization is
 * forced first.
 */
static void
grn_tokenizer_query_ensure_have_tokenized_delimiter(grn_ctx *ctx,
                                                    grn_tokenizer_query *query)
{
  grn_tokenizer_query_ensure_normalized(ctx, query);

  if (!query->need_delimiter_check) {
    return;
  }
  query->need_delimiter_check = GRN_FALSE;

  /* Without the cursor flag, tokenized-delimiter handling is disabled. */
  if (!(query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER)) {
    query->have_tokenized_delimiter = GRN_FALSE;
    return;
  }

  {
    const char *normalized;
    unsigned int normalized_length;
    grn_string_get_normalized(ctx,
                              query->normalized_query,
                              &normalized,
                              &normalized_length,
                              NULL);
    query->have_tokenized_delimiter =
      grn_tokenizer_have_tokenized_delimiter(ctx,
                                             normalized,
                                             normalized_length,
                                             query->encoding);
  }
}
/*
 * Common initializer for delimiter-based tokenizers (e.g. TokenDelimit).
 *
 * Opens the tokenizer query, allocates per-call tokenizer state in
 * user_data->ptr, and positions the scan cursor [next, end) over the
 * normalized query text.  `delimiter`/`delimiter_len` identify the byte
 * sequence that separates tokens; ownership of `delimiter` stays with the
 * caller (it must outlive the tokenizer).
 *
 * Returns NULL in all cases (tokenizer-init convention in this file);
 * on failure ctx->rc is set and user_data->ptr is left untouched.
 */
static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args,
               grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags = 0;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_delimited_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;

  tokenizer->query = query;

  grn_string_get_normalized(ctx,
                            tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  /* BUGFIX: detect the tokenized delimiter in the NORMALIZED text, not in
     the raw query (query->ptr/query->length) as before.  Every other
     check site in this file (grn_tokenizer_query_open, mecab_init, the
     ensure_* helper) scans the normalized string, and the cursor below
     walks the normalized string too, so the raw-text check could disagree
     with what actually gets tokenized. */
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized,
                                           normalized_length_in_bytes,
                                           tokenizer->query->encoding);
  tokenizer->delimiter = delimiter;
  tokenizer->delimiter_len = delimiter_len;
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
/*
 * Build a grn_tokenizer_query from the tokenizer call arguments.
 *
 * The query string, cursor flags and tokenize mode arrive on the ctx
 * value stack; they are popped here in a fixed order (flags, query
 * string, tokenize mode) that must match what the tokenizer driver
 * pushed.  args[0] is the lexicon table whose encoding/normalizer
 * settings drive normalization.
 *
 * On success returns a heap-allocated query (caller frees with
 * grn_tokenizer_query_close()); on failure returns NULL with ctx->rc
 * set and all partial allocations released.
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* Pop order is part of the calling convention — do not reorder. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }
  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    /* Pre-null the owned pointers so error paths can free safely. */
    query->normalized_query = NULL;
    query->query_buf = NULL;
    /* flags/tokenize_mode are optional stack entries; fall back to
       defaults when the caller did not push them. */
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    /* token_mode mirrors tokenize_mode (kept for compatibility). */
    query->token_mode = query->tokenize_mode;
    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      /* +1 for the NUL terminator appended below. */
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;
      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        /* Legacy KEY_NORMALIZE flag forces the auto normalizer even if
           the table has no normalizer object attached. */
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          /* Release everything allocated so far; normalized_query is the
             last resource acquired. */
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        /* Keep a NUL-terminated copy of the RAW (pre-normalization)
           query for tokenizers that read query->ptr directly. */
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;
      /* Tokenized-delimiter detection is opt-in via the cursor flag and
         is performed on the normalized text. */
      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;
        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
/*
 * This function is called for a full text search query or a document to
 * be indexed, so both short and long strings are given.  The return
 * value is ignored; on failure ctx->rc is set to a non-GRN_SUCCESS code.
 *
 * Lazily creates the process-wide MeCab handle (sole_mecab) under
 * sole_mecab_mutex, validates that the MeCab dictionary charset matches
 * the table encoding, and prepares per-call tokenizer state in
 * user_data->ptr.  When the query contains a tokenized delimiter, MeCab
 * is bypassed and the normalized string is scanned directly; otherwise
 * the whole text is parsed into a space-separated buffer up front.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* buf starts NULL so the `!s || !buf` failure test below never reads an
     indeterminate pointer (previously it relied on short-circuiting). */
  char *buf = NULL, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;
  unsigned int normalize_flags = 0;

  /* BUGFIX: grn_tokenizer_query_open() takes a normalize_flags argument
     (see its definition in this file); the old three-argument call did
     not match that prototype.  No extra normalization is requested. */
  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    /* Double-checked creation of the shared handle; the second check runs
       under the mutex, so only one thread calls mecab_new2(). */
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);
  if (tokenizer->have_tokenized_delimiter) {
    /* Delimiter-driven mode: iterate the normalized text directly. */
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    /* MeCab parsing must be serialized: sole_mecab is shared and
       mecab_sparse_tostr2()'s result buffer is owned by the handle. */
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      /* Copy MeCab's result out before releasing the mutex, since the
         handle-owned buffer may be reused by the next caller. */
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        /* NOTE(review): only logged, no GRN_PLUGIN_ERROR here — presumably
           GRN_PLUGIN_MALLOC already set ctx->rc; confirm. */
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) {
      *p = '\0';
    }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}