/*
 * Verifies that MeCab can be instantiated and that the encoding reported by
 * get_mecab_encoding() for it equals the encoding of `ctx`.
 *
 * A plugin error (GRN_TOKENIZER_ERROR) is raised on `ctx` when MeCab cannot
 * be created or when the encodings differ. The whole check is compiled out
 * unless HAVE_MECAB_DICTIONARY_INFO_T is defined.
 */
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *probe = mecab_new2("-Owakati");
  grn_encoding context_encoding;
  grn_bool encodings_match;

  if (!probe) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
    return;
  }

  context_encoding = GRN_CTX_GET_ENCODING(ctx);
  encodings_match = (context_encoding == get_mecab_encoding(probe));
  /* The probe instance is only needed for the comparison above. */
  mecab_destroy(probe);

  if (encodings_match) {
    return;
  }

  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                   "[tokenizer][mecab] "
                   "MeCab has no dictionary that uses the context encoding"
                   ": <%s>",
                   grn_encoding_to_string(context_encoding));
#endif
}
/*
 * Allocates a tokenizer query and fills it from the values popped off `ctx`
 * and from args[0].
 *
 * Three objects are popped from the context, in this order: flags, the query
 * string and the tokenize mode. The query string is mandatory; flags and
 * mode are applied only when present. args[0] is installed as the lexicon.
 *
 * Returns the new query, or NULL after a failure (missing query string,
 * missing args[0], allocation failure, or an error left on ctx->rc by
 * grn_tokenizer_query_set_raw_string()).
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags_obj;
  grn_obj *text_obj;
  grn_obj *mode_obj;
  grn_tokenizer_query *new_query;

  GRN_API_ENTER;

  /* The pop order is part of the calling convention: flags, text, mode. */
  flags_obj = grn_ctx_pop(ctx);
  text_obj = grn_ctx_pop(ctx);
  mode_obj = grn_ctx_pop(ctx);

  if (text_obj == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    GRN_API_RETURN(NULL);
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    GRN_API_RETURN(NULL);
  }

  new_query = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
  if (!new_query) {
    GRN_API_RETURN(NULL);
  }

  grn_tokenizer_query_init(ctx, new_query);
  grn_tokenizer_query_set_raw_string(ctx, new_query,
                                     GRN_TEXT_VALUE(text_obj),
                                     GRN_TEXT_LEN(text_obj));
  if (ctx->rc != GRN_SUCCESS) {
    GRN_PLUGIN_FREE(ctx, new_query);
    GRN_API_RETURN(NULL);
  }

  if (flags_obj) {
    grn_tokenizer_query_set_flags(ctx, new_query,
                                  GRN_UINT32_VALUE(flags_obj));
  }
  if (mode_obj) {
    grn_tokenizer_query_set_mode(ctx, new_query,
                                 GRN_UINT32_VALUE(mode_obj));
  }
  grn_tokenizer_query_set_normalize_flags(ctx, new_query, normalize_flags);
  grn_tokenizer_query_set_lexicon(ctx, new_query, args[0]);

  grn_tokenizer_query_ensure_have_tokenized_delimiter(ctx, new_query);

  GRN_API_RETURN(new_query);
}
/*
 * Token filter callback: stems the data of `current_token` with a Snowball
 * "english" stemmer and stores the result in `next_token`.
 *
 * Nothing is done unless the context encoding is UTF-8. The stemmer kept in
 * the filter state (`user_data`) is deleted and recreated on every call.
 * Errors are reported on `ctx`; in that case `next_token` is not modified.
 */
static void
stem_filter(grn_ctx *ctx, grn_token *current_token, grn_token *next_token,
            void *user_data)
{
  /* TODO: Detect algorithm from the current token. */
  const char *algorithm = "english";
  const char *encoding = "UTF_8";
  grn_stem_token_filter *state = user_data;
  grn_obj *source;
  const sb_symbol *stem;

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  source = grn_token_get_data(ctx, current_token);

  if (state->stemmer) {
    sb_stemmer_delete(state->stemmer);
  }
  state->stemmer = sb_stemmer_new(algorithm, encoding);
  if (!state->stemmer) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[token-filter][stem] "
                     "failed to create stemmer: "
                     "algorithm=<%s>, encoding=<%s>",
                     algorithm, encoding);
    return;
  }

  stem = sb_stemmer_stem(state->stemmer,
                         GRN_TEXT_VALUE(source),
                         GRN_TEXT_LEN(source));
  if (!stem) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stem] "
                     "failed to allocate memory for stemmed word: <%.*s>",
                     (int)GRN_TEXT_LEN(source), GRN_TEXT_VALUE(source));
    return;
  }

  grn_token_set_data(ctx, next_token, stem,
                     sb_stemmer_length(state->stemmer));
}
/*
 * Tokenizes one chunk with MeCab and appends the result to tokenizer->buf.
 *
 * chunk/chunk_bytes: the bytes handed to mecab_sparse_tostr2().
 *
 * A single " " separator is appended first when the buffer already holds
 * output from an earlier chunk. One trailing whitespace byte of the MeCab
 * output, if any, is dropped before appending.
 *
 * Returns GRN_TRUE on success; GRN_FALSE (with a plugin error on `ctx`)
 * when MeCab returns no result.
 */
static grn_bool
chunked_tokenize_utf8_chunk(grn_ctx *ctx, grn_mecab_tokenizer *tokenizer,
                            const char *chunk, unsigned int chunk_bytes)
{
  const char *tokenized_chunk;
  size_t tokenized_chunk_length;

  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
  if (!tokenized_chunk) {
    /* chunk_bytes is unsigned, so it is formatted with %u. */
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab][chunk] "
                     "mecab_sparse_tostr2() failed len=%u err=%s",
                     chunk_bytes, mecab_strerror(tokenizer->mecab));
    return GRN_FALSE;
  }

  if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
    GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
  }

  tokenized_chunk_length = strlen(tokenized_chunk);
  /*
   * isspace() requires a value representable as unsigned char (or EOF).
   * The output may end with a byte >= 0x80, which is negative when plain
   * char is signed, so the byte is converted explicitly.
   */
  if (tokenized_chunk_length >= 1 &&
      isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) {
    tokenized_chunk_length--;
  }
  GRN_TEXT_PUT(ctx, &(tokenizer->buf), tokenized_chunk,
               tokenized_chunk_length);

  return GRN_TRUE;
}
/*
 * Registers a token filter under the given name and stores its three
 * callbacks on the created proc object.
 *
 * plugin_name_length may be -1, in which case plugin_name_ptr is measured
 * with strlen().
 *
 * Returns GRN_SUCCESS, or ctx->rc after reporting a failure of
 * grn_proc_create().
 *
 * NOTE(review): the proc is created with GRN_PROC_TOKENIZER although the
 * callbacks stored below are the token-filter ones - confirm that this proc
 * type is intended.
 */
grn_rc
grn_token_filter_register(grn_ctx *ctx,
                          const char *plugin_name_ptr,
                          int plugin_name_length,
                          grn_token_filter_init_func *init,
                          grn_token_filter_filter_func *filter,
                          grn_token_filter_fin_func *fin)
{
  grn_obj *created;
  grn_proc *proc;

  if (plugin_name_length == -1) {
    plugin_name_length = strlen(plugin_name_ptr);
  }

  created = grn_proc_create(ctx,
                            plugin_name_ptr, plugin_name_length,
                            GRN_PROC_TOKENIZER,
                            NULL, NULL, NULL, 0, NULL);
  if (created == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR,
                     "[token-filter][%.*s] failed to grn_proc_create()",
                     plugin_name_length, plugin_name_ptr);
    return ctx->rc;
  }

  proc = (grn_proc *)created;
  proc->callbacks.token_filter.init = init;
  proc->callbacks.token_filter.filter = filter;
  proc->callbacks.token_filter.fin = fin;

  return GRN_SUCCESS;
}
/*
 * Registers a tokenizer proc with the given init/next/fin functions.
 *
 * The proc is declared with three unnamed variables: two text values and
 * one UInt32 value.
 *
 * Returns GRN_SUCCESS, or ctx->rc after reporting a failure of
 * grn_proc_create().
 */
grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[] = {
    { NULL, 0 },
    { NULL, 0 },
    { NULL, 0 }
  };
  grn_obj *registered;

  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  /*
    grn_proc_create() registers a plugin to the database which is associated
    with `ctx'. A returned object must not be finalized here.
   */
  registered = grn_proc_create(ctx, plugin_name_ptr, plugin_name_length,
                               GRN_PROC_TOKENIZER, init, next, fin,
                               sizeof(vars) / sizeof(vars[0]), vars);
  if (registered != NULL) {
    return GRN_SUCCESS;
  }

  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
  return ctx->rc;
}
/*
 * Rebuilds query->normalized_query when the query is flagged as needing
 * normalization; otherwise does nothing.
 *
 * The flag is cleared before the rebuild. Any previous normalized string is
 * closed first. On success a delimiter re-check is requested through
 * need_delimiter_check; on failure have_tokenized_delimiter is cleared and a
 * plugin error is raised on `ctx`.
 */
static void
grn_tokenizer_query_ensure_normalized(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query->need_normalize) {
    query->need_normalize = GRN_FALSE;

    if (query->normalized_query) {
      grn_obj_close(ctx, query->normalized_query);
    }

    query->normalized_query = grn_string_open_(ctx,
                                               query->ptr,
                                               query->length,
                                               query->lexicon,
                                               query->normalize_flags,
                                               query->encoding);
    if (query->normalized_query) {
      query->need_delimiter_check = GRN_TRUE;
    } else {
      query->have_tokenized_delimiter = GRN_FALSE;
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][normalize] failed to open normalized string");
    }
  }
}
/*
 * Creates a tokenizer proc named `name` with no callbacks installed.
 *
 * name_length may be negative; the name is then measured with strlen(), but
 * only for the error message - grn_proc_create() receives the value as
 * passed in.
 *
 * Returns the created object, or NULL after raising a plugin error.
 */
grn_obj *
grn_tokenizer_create(grn_ctx *ctx, const char *name, int name_length)
{
  grn_obj *tokenizer;

  GRN_API_ENTER;

  tokenizer = grn_proc_create(ctx, name, name_length, GRN_PROC_TOKENIZER,
                              NULL, NULL, NULL, 0, NULL);
  if (tokenizer) {
    GRN_API_RETURN(tokenizer);
  }

  if (name_length < 0) {
    name_length = strlen(name);
  }
  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                   "[tokenizer][create] failed to create: <%.*s>",
                   name_length, name);

  GRN_API_RETURN(tokenizer);
}
/*
 * Replaces the raw string held by `query` with a private, NUL-terminated
 * copy of string[0 .. string_length).
 *
 * Any previously owned buffer is released. The query is always flagged with
 * need_normalize, for empty and non-empty strings alike, so that the
 * normalized form is rebuilt from the new text instead of being reused from
 * the old one.
 *
 * When string_length is 0, or when the copy cannot be allocated, the query
 * is left holding no string (query_buf == NULL, ptr == NULL, length == 0),
 * so it never refers to the released buffer.
 *
 * Returns ctx->rc; an allocation failure is reported as
 * GRN_TOKENIZER_ERROR.
 */
grn_rc
grn_tokenizer_query_set_raw_string(grn_ctx *ctx,
                                   grn_tokenizer_query *query,
                                   const char *string,
                                   size_t string_length)
{
  GRN_API_ENTER;

  if (query->query_buf) {
    GRN_PLUGIN_FREE(ctx, query->query_buf);
  }

  /* Start from the "no string" state; filled in below on success. */
  query->query_buf = NULL;
  query->ptr = NULL;
  query->length = 0;
  query->need_normalize = GRN_TRUE;

  if (string_length > 0) {
    query->query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, string_length + 1);
    if (!query->query_buf) {
      GRN_PLUGIN_ERROR(ctx,
                       GRN_TOKENIZER_ERROR,
                       "[tokenizer][query] failed to duplicate query");
      GRN_API_RETURN(ctx->rc);
    }
    grn_memcpy(query->query_buf, string, string_length);
    query->query_buf[string_length] = '\0';
    query->ptr = query->query_buf;
    query->length = string_length;
  }

  GRN_API_RETURN(ctx->rc);
}
/*
 * vector_size(target): returns the number of elements of a vector.
 *
 * Exactly one argument is accepted and it must be a GRN_VECTOR,
 * GRN_PVECTOR or GRN_UVECTOR.
 *
 * Returns a UInt32 object allocated with grn_plugin_proc_alloc() that holds
 * the size, or NULL when the arguments are invalid (a plugin error is
 * raised) or the result object cannot be allocated.
 */
static grn_obj *
func_vector_size(grn_ctx *ctx, int n_args, grn_obj **args,
                 grn_user_data *user_data)
{
  grn_obj *target;
  unsigned int size;
  grn_obj *grn_size;

  if (n_args != 1) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "vector_size(): wrong number of arguments (%d for 1)",
                     n_args);
    return NULL;
  }

  target = args[0];
  switch (target->header.type) {
  case GRN_VECTOR :
  case GRN_PVECTOR :
  case GRN_UVECTOR :
    size = grn_vector_size(ctx, target);
    break;
  default :
    {
      grn_obj inspected;

      GRN_TEXT_INIT(&inspected, 0);
      /*
       * grn_inspect() takes the output buffer first and the inspected
       * object second. Passing them the other way round would write into
       * `target` and leave the message empty.
       */
      grn_inspect(ctx, &inspected, target);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "vector_size(): target object must be vector: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      return NULL;
    }
  }

  grn_size = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0);
  if (!grn_size) {
    return NULL;
  }

  GRN_UINT32_SET(ctx, grn_size, size);

  return grn_size;
}
/*
 * highlight_html(string[, lexicon]): highlights `string` with a highlighter
 * stored in an expression variable.
 *
 * The highlighter is looked up in the variable named
 * GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME of the calling expression. When the
 * variable does not exist yet it is added, re-initialized as an owning
 * pointer, and filled with a highlighter built by
 * func_highlight_html_create_highlighter() to which `lexicon` is applied.
 *
 * Returns a text object with the highlighted string, or a void object after
 * raising an error for a wrong number of arguments.
 */
static grn_obj *
func_highlight_html(grn_ctx *ctx, int nargs, grn_obj **args,
                    grn_user_data *user_data)
{
  const char *cache_name = GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME;
  size_t cache_name_length;
  grn_obj *result = NULL;
  grn_obj *text;
  grn_obj *lexicon = NULL;
  grn_obj *expression = NULL;
  grn_obj *cache_slot;
  grn_highlighter *highlighter;

  if (nargs < 1 || nargs > 2) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "highlight_html(): wrong number of arguments (%d for 1..2)",
                     nargs);
    result = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_VOID, 0);
    return result;
  }

  text = args[0];
  if (nargs == 2) {
    lexicon = args[1];
  }

  grn_proc_get_info(ctx, user_data, NULL, NULL, &expression);

  cache_name_length = strlen(cache_name);
  cache_slot = grn_expr_get_var(ctx, expression,
                                cache_name, cache_name_length);
  if (!cache_slot) {
    /* First use in this expression: create the slot and the highlighter. */
    cache_slot = grn_expr_get_or_add_var(ctx, expression,
                                         cache_name, cache_name_length);
    GRN_OBJ_FIN(ctx, cache_slot);
    GRN_PTR_INIT(cache_slot, GRN_OBJ_OWN, GRN_DB_OBJECT);

    highlighter = func_highlight_html_create_highlighter(ctx, expression);
    grn_highlighter_set_lexicon(ctx, highlighter, lexicon);
    GRN_PTR_SET(ctx, cache_slot, highlighter);
  } else {
    highlighter = (grn_highlighter *)GRN_PTR_VALUE(cache_slot);
  }

  result = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_TEXT, 0);
  grn_highlighter_highlight(ctx,
                            highlighter,
                            GRN_TEXT_VALUE(text),
                            GRN_TEXT_LEN(text),
                            result);

  return result;
}
/*
  Plugin entry point: resets the shared MeCab handle, opens the mutex that
  guards it and then runs check_mecab_dictionary_encoding().

  The return value is ctx->rc, so any error raised by the mutex creation or
  by the dictionary encoding check makes initialization fail.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  sole_mecab = NULL;

  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (sole_mecab_mutex) {
    check_mecab_dictionary_encoding(ctx);
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
  }

  return ctx->rc;
}
/*
 * Installs `fin` as the tokenizer finalizer callback of `tokenizer`.
 *
 * A NULL tokenizer is rejected with GRN_INVALID_ARGUMENT.
 * Returns ctx->rc.
 */
grn_rc
grn_tokenizer_set_fin_func(grn_ctx *ctx, grn_obj *tokenizer,
                           grn_tokenizer_fin_func *fin)
{
  GRN_API_ENTER;

  if (!tokenizer) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenizer][fin][set] tokenizer is NULL");
    GRN_API_RETURN(ctx->rc);
  }

  ((grn_proc *)tokenizer)->callbacks.tokenizer.fin = fin;

  GRN_API_RETURN(ctx->rc);
}
/*
 * Emits the "indexes" array for `column`: one map per index with the keys
 * "full_name", "table", "name" and "section".
 *
 * A NULL column yields an empty array. When the temporary index list cannot
 * be allocated an error is raised and nothing is emitted.
 */
static void
command_schema_column_output_indexes(grn_ctx *ctx, grn_obj *column)
{
  grn_index_datum *indexes = NULL;
  uint32_t n_indexes = 0;
  uint32_t nth;

  if (column) {
    /* First call only counts; the second one below fills the array. */
    n_indexes = grn_column_get_all_index_data(ctx, column, NULL, 0);
  }

  if (n_indexes > 0) {
    indexes = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_index_datum) * n_indexes);
    if (!indexes) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[schema] failed to allocate memory for indexes");
      return;
    }
    grn_column_get_all_index_data(ctx, column, indexes, n_indexes);
  }

  grn_ctx_output_array_open(ctx, "indexes", n_indexes);
  for (nth = 0; nth < n_indexes; nth++) {
    grn_index_datum *datum = &(indexes[nth]);
    grn_obj *index_lexicon;

    grn_ctx_output_map_open(ctx, "index", 4);

    grn_ctx_output_cstr(ctx, "full_name");
    command_schema_output_name(ctx, datum->index);

    grn_ctx_output_cstr(ctx, "table");
    index_lexicon = grn_ctx_at(ctx, datum->index->header.domain);
    command_schema_output_name(ctx, index_lexicon);

    grn_ctx_output_cstr(ctx, "name");
    command_schema_output_column_name(ctx, datum->index);

    grn_ctx_output_cstr(ctx, "section");
    grn_ctx_output_uint64(ctx, datum->section);

    grn_ctx_output_map_close(ctx);
  }
  grn_ctx_output_array_close(ctx);

  if (indexes) {
    GRN_PLUGIN_FREE(ctx, indexes);
  }
}
/*
 * Token filter init callback: allocates the per-use state of the stem
 * filter with no stemmer attached and an initialized token.
 *
 * `table` and `mode` are not used here.
 * Returns the new state, or NULL after reporting an allocation failure.
 */
static void *
stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
{
  grn_stem_token_filter *state =
    GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));

  if (state) {
    state->stemmer = NULL;
    grn_tokenizer_token_init(ctx, &(state->token));
    return state;
  }

  GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                   "[token-filter][stem] "
                   "failed to allocate grn_stem_token_filter");
  return NULL;
}
/*
  Plugin entry point.

  Reads two environment variables first:
    - GRN_MECAB_CHUNKED_TOKENIZE_ENABLED: chunked tokenization is enabled
      only when the value is exactly "yes".
    - GRN_MECAB_CHUNK_SIZE_THRESHOLD: replaces the chunk size threshold when
      the whole value is consumed by grn_atoi().

  Then resets the shared MeCab handle, opens the mutex that guards it and
  runs check_mecab_dictionary_encoding(). The return value is ctx->rc, so an
  error raised by either step makes initialization fail.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  {
    char enabled_env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED",
               enabled_env,
               GRN_ENV_BUFFER_SIZE);
    grn_mecab_chunked_tokenize_enabled =
      (enabled_env[0] && strcmp(enabled_env, "yes") == 0);
  }

  {
    char threshold_env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD",
               threshold_env,
               GRN_ENV_BUFFER_SIZE);
    if (threshold_env[0]) {
      const char *value_end = threshold_env + strlen(threshold_env);
      const char *unparsed;
      int parsed = -1;

      parsed = grn_atoi(threshold_env, value_end, &unparsed);
      /* Accept the value only when nothing was left unparsed. */
      if (value_end > threshold_env && value_end == unparsed) {
        grn_mecab_chunk_size_threshold = parsed;
      }
    }
  }

  sole_mecab = NULL;
  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (sole_mecab_mutex) {
    check_mecab_dictionary_encoding(ctx);
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
  }

  return ctx->rc;
}
/*
 * Tokenizer init callback of the sample tokenizer.
 *
 * Opens a tokenizer query with no normalize flags, allocates the tokenizer
 * state, points its cursor (`next`/`rest`) at the normalized query text and
 * stores the state in user_data->ptr.
 *
 * Always returns NULL; failures are reported through `ctx`.
 */
static grn_obj *
sample_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  sample_tokenizer *state;
  const char *text;
  unsigned int text_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  state = GRN_PLUGIN_MALLOC(ctx, sizeof(sample_tokenizer));
  if (!state) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][sample] "
                     "memory allocation to sample_tokenizer failed");
    return NULL;
  }

  state->query = query;

  grn_string_get_normalized(ctx, query->normalized_query,
                            &text, &text_length, NULL);
  state->next = text;
  state->rest = text_length;

  user_data->ptr = state;

  grn_tokenizer_token_init(ctx, &(state->token));

  return NULL;
}
/*
 * Parses a flag list such as "ENABLE_TOKENIZED_DELIMITER|NONE" into a
 * bitmask of GRN_TOKEN_CURSOR_* values.
 *
 * '|' and ' ' are skipped as separators. The recognized names are
 * ENABLE_TOKENIZED_DELIMITER and NONE (which contributes 0). Names are
 * matched as prefixes of the remaining text.
 *
 * Returns the bitmask, or 0 after raising GRN_INVALID_ARGUMENT for text
 * that matches no known name.
 */
static unsigned int
parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
{
  unsigned int parsed = 0;
  const char *cursor;
  const char *tail;
  int n_bytes;

  cursor = GRN_TEXT_VALUE(flag_names);
  n_bytes = GRN_TEXT_LEN(flag_names);
  tail = cursor + n_bytes;

/* Consumes `name` at the cursor, ORs in its flag and restarts the loop. */
#define CHECK_FLAG(name)\
    if (((tail - cursor) >= (sizeof(#name) - 1)) &&\
        (!memcmp(cursor, #name, sizeof(#name) - 1))) {\
      parsed |= GRN_TOKEN_CURSOR_ ## name;\
      cursor += sizeof(#name) - 1;\
      continue;\
    }

  while (cursor < tail) {
    switch (*cursor) {
    case '|' :
    case ' ' :
      cursor += 1;
      continue;
    default :
      break;
    }

    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);

#define GRN_TOKEN_CURSOR_NONE 0
    CHECK_FLAG(NONE);
#undef GRN_TOKEN_CURSOR_NONE

    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] invalid flag: <%.*s>",
                     (int)(tail - cursor), cursor);
    return 0;
  }

#undef CHECK_FLAG

  return parsed;
}
/*
 * Registers a command proc named `command_name` with the given variables
 * and attaches the `run` callback and `user_data` to it.
 *
 * command_name_size may be -1, in which case the name is measured with
 * strlen().
 *
 * Returns GRN_SUCCESS, or ctx->rc after reporting a failure of
 * grn_proc_create().
 */
grn_rc
grn_command_register(grn_ctx *ctx,
                     const char *command_name,
                     int command_name_size,
                     grn_command_run_func *run,
                     grn_expr_var *vars,
                     unsigned int n_vars,
                     void *user_data)
{
  grn_obj *created;
  grn_proc *proc;

  GRN_API_ENTER;

  if (command_name_size == -1) {
    command_name_size = strlen(command_name);
  }

  created = grn_proc_create(ctx,
                            command_name, command_name_size,
                            GRN_PROC_COMMAND,
                            NULL, NULL, NULL, n_vars, vars);
  if (!created) {
    GRN_PLUGIN_ERROR(ctx, GRN_COMMAND_ERROR,
                     "[command][%.*s] failed to grn_proc_create()",
                     command_name_size, command_name);
    GRN_API_RETURN(ctx->rc);
  }

  proc = (grn_proc *)created;
  proc->callbacks.command.run = run;
  proc->user_data = user_data;

  GRN_API_RETURN(GRN_SUCCESS);
}
/*
 * Creates the anonymous patricia-trie table (ShortText keys) that collects
 * highlight keywords and, when a normalizer name is given, attaches that
 * normalizer to it.
 *
 * Returns the table. When the named object is not a normalizer proc, an
 * error is raised, the table is released and NULL is returned.
 * `user_data` is not used here.
 */
static grn_obj *
func_highlight_create_keywords_table(grn_ctx *ctx,
                                     grn_user_data *user_data,
                                     const char *normalizer_name,
                                     unsigned int normalizer_name_length)
{
  grn_obj *keywords;
  grn_obj *normalizer;

  keywords = grn_table_create(ctx, NULL, 0, NULL,
                              GRN_OBJ_TABLE_PAT_KEY,
                              grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
                              NULL);

  if (normalizer_name_length == 0) {
    return keywords;
  }

  normalizer = grn_ctx_get(ctx, normalizer_name, normalizer_name_length);
  if (grn_obj_is_normalizer_proc(ctx, normalizer)) {
    grn_obj_set_info(ctx, keywords, GRN_INFO_NORMALIZER, normalizer);
    grn_obj_unlink(ctx, normalizer);
    return keywords;
  }

  {
    grn_obj inspected;

    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, normalizer);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "highlight_full() not normalizer: <%.*s>",
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    GRN_OBJ_FIN(ctx, &inspected);
  }
  grn_obj_unlink(ctx, normalizer);
  grn_obj_unlink(ctx, keywords);
  return NULL;
}
/*
 * Tokenizes `string` by handing it to chunked_tokenize_utf8_chunk() in
 * pieces.
 *
 * Input shorter than grn_mecab_chunk_size_threshold is passed through as a
 * single chunk. Longer input is scanned character by character:
 *   - a whitespace character ends the pending chunk and is itself skipped;
 *   - once the pending chunk reaches the threshold it is cut just after the
 *     most recent delimiter character seen in it, or at the current
 *     position when no delimiter was seen.
 * Whatever remains after the scan is passed as the last chunk.
 *
 * Returns GRN_TRUE on success. Returns GRN_FALSE as soon as a chunk fails
 * or an invalid byte sequence is found (a plugin error is raised for the
 * latter).
 */
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  /* Short input: no splitting needed. */
  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;

    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      /* Flush the pending chunk (if any) and restart after the space. */
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    current_character = current;
    current += character_bytes;
    /* last_delimiter points just past the delimiter character. */
    if (is_delimiter_character(ctx,
                               current_character,
                               character_bytes)) {
      last_delimiter = current;
    }

    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        /* Cut after the delimiter; the tail stays in the pending chunk. */
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        /* No delimiter seen: cut at the current character boundary. */
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  /* Emit the remainder, if the scan did not end exactly on a cut. */
  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}
static grn_rc selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index, int nargs, grn_obj **args, grn_obj *res, grn_operator op) { grn_rc rc = GRN_SUCCESS; grn_obj *target = NULL; grn_obj *obj; grn_obj *query; uint32_t max_distance = 1; uint32_t prefix_length = 0; uint32_t prefix_match_size = 0; uint32_t max_expansion = 0; int flags = 0; grn_bool use_sequential_search = GRN_FALSE; if ((nargs - 1) < 2) { GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "fuzzy_search(): wrong number of arguments (%d ...)", nargs - 1); rc = ctx->rc; goto exit; } obj = args[1]; query = args[2]; if (nargs == 4) { grn_obj *options = args[3]; grn_hash_cursor *cursor; void *key; grn_obj *value; int key_size; if (options->header.type != GRN_TABLE_HASH_KEY) { GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "fuzzy_search(): " "3rd argument must be object literal: <%.*s>", (int)GRN_TEXT_LEN(options), GRN_TEXT_VALUE(options)); goto exit; } cursor = grn_hash_cursor_open(ctx, (grn_hash *)options, NULL, 0, NULL, 0, 0, -1, 0); if (!cursor) { GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, "fuzzy_search(): couldn't open cursor"); goto exit; } while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) { grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size, (void **)&value); if (key_size == 12 && !memcmp(key, "max_distance", 12)) { max_distance = GRN_UINT32_VALUE(value); } else if (key_size == 13 && !memcmp(key, "prefix_length", 13)) { prefix_length = GRN_UINT32_VALUE(value); } else if (key_size == 13 && !memcmp(key, "max_expansion", 13)) { max_expansion = GRN_UINT32_VALUE(value); } else if (key_size == 18 && !memcmp(key, "with_transposition", 18)) { if (GRN_BOOL_VALUE(value)) { flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION; } } else { GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid option name: <%.*s>", key_size, (char *)key); grn_hash_cursor_close(ctx, cursor); goto exit; } } grn_hash_cursor_close(ctx, cursor); } if (index) { target = index; } else { if (obj->header.type == 
GRN_COLUMN_INDEX) { target = obj; } else { grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL); } } if (target) { grn_obj *lexicon; use_sequential_search = GRN_TRUE; lexicon = grn_ctx_at(ctx, target->header.domain); if (lexicon) { if (lexicon->header.type == GRN_TABLE_PAT_KEY) { use_sequential_search = GRN_FALSE; } grn_obj_unlink(ctx, lexicon); } } else { if (grn_obj_is_key_accessor(ctx, obj) && table->header.type == GRN_TABLE_PAT_KEY) { target = table; } else { use_sequential_search = GRN_TRUE; } } if (prefix_length) { const char *s = GRN_TEXT_VALUE(query); const char *e = GRN_BULK_CURR(query); const char *p; unsigned int cl = 0; unsigned int length = 0; for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) { length++; if (length > prefix_length) { break; } } prefix_match_size = p - s; } if (use_sequential_search) { rc = sequential_fuzzy_search(ctx, table, obj, query, max_distance, prefix_match_size, max_expansion, flags, res, op); goto exit; } if (!target) { grn_obj inspected; GRN_TEXT_INIT(&inspected, 0); grn_inspect(ctx, &inspected, target); GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "fuzzy_search(): " "column must be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>", (int)GRN_TEXT_LEN(&inspected), GRN_TEXT_VALUE(&inspected)); rc = ctx->rc; GRN_OBJ_FIN(ctx, &inspected); } else { grn_search_optarg options = {0}; options.mode = GRN_OP_FUZZY; options.fuzzy.prefix_match_size = prefix_match_size; options.fuzzy.max_distance = max_distance; options.fuzzy.max_expansion = max_expansion; options.fuzzy.flags = flags; grn_obj_search(ctx, target, query, res, op, &options); } exit : return rc; }
/*
 * highlight(string, keyword..., [options]): highlights keywords in
 * args[0].
 *
 * At least one argument besides the string is required. When the last
 * argument is a hash table it is read as options and excluded from the
 * keyword arguments. Recognized option keys: "normalizer", "html_escape",
 * "default_open_tag" and "default_close_tag"; any other key raises
 * GRN_INVALID_ARGUMENT.
 *
 * Without default tags the keyword arguments are handed to
 * highlight_keyword_sets(); otherwise each keyword argument is added to the
 * keywords table and highlight_keywords() is called with the default tags.
 *
 * Returns the highlighted object, or a void object when nothing was
 * produced (too few arguments, option error, or no keywords table).
 */
static grn_obj *
func_highlight(grn_ctx *ctx, int nargs, grn_obj **args,
               grn_user_data *user_data)
{
  grn_obj *highlighted = NULL;

#define N_REQUIRED_ARGS 1
  if (nargs > N_REQUIRED_ARGS) {
    grn_obj *string = args[0];
    grn_bool use_html_escape = GRN_FALSE;
    grn_obj *keywords;
    const char *normalizer_name = "NormalizerAuto";
    unsigned int normalizer_name_length = 14; /* length of "NormalizerAuto" */
    const char *default_open_tag = NULL;
    unsigned int default_open_tag_length = 0;
    const char *default_close_tag = NULL;
    unsigned int default_close_tag_length = 0;
    grn_obj *end_arg = args[nargs - 1];
    int n_args_without_option = nargs;

    if (end_arg->header.type == GRN_TABLE_HASH_KEY) {
      grn_obj *options = end_arg;
      grn_hash_cursor *cursor;
      void *key;
      grn_obj *value;
      int key_size;

      /* The trailing options table is not a keyword argument. */
      n_args_without_option--;
      cursor = grn_hash_cursor_open(ctx, (grn_hash *)options,
                                    NULL, 0, NULL, 0,
                                    0, -1, 0);
      if (!cursor) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "highlight(): couldn't open cursor");
        goto exit;
      }
      while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
        grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size,
                                      (void **)&value);
        /* The text options below keep pointers into the option values. */
        if (key_size == 10 && !memcmp(key, "normalizer", 10)) {
          normalizer_name = GRN_TEXT_VALUE(value);
          normalizer_name_length = GRN_TEXT_LEN(value);
        } else if (key_size == 11 && !memcmp(key, "html_escape", 11)) {
          if (GRN_BOOL_VALUE(value)) {
            use_html_escape = GRN_TRUE;
          }
        } else if (key_size == 16 && !memcmp(key, "default_open_tag", 16)) {
          default_open_tag = GRN_TEXT_VALUE(value);
          default_open_tag_length = GRN_TEXT_LEN(value);
        } else if (key_size == 17 && !memcmp(key, "default_close_tag", 17)) {
          default_close_tag = GRN_TEXT_VALUE(value);
          default_close_tag_length = GRN_TEXT_LEN(value);
        } else {
          GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                           "invalid option name: <%.*s>",
                           key_size, (char *)key);
          grn_hash_cursor_close(ctx, cursor);
          goto exit;
        }
      }
      grn_hash_cursor_close(ctx, cursor);
    }

    /*
     * NOTE(review): the keywords table is not released in this function;
     * confirm that its lifetime is handled by the helpers it is passed to.
     */
    keywords =
      func_highlight_create_keywords_table(ctx, user_data,
                                           normalizer_name,
                                           normalizer_name_length);
    if (keywords) {
      grn_obj **keyword_args = args + N_REQUIRED_ARGS;
      unsigned int n_keyword_args = n_args_without_option - N_REQUIRED_ARGS;
      if (default_open_tag_length == 0 && default_close_tag_length == 0) {
        highlighted = highlight_keyword_sets(ctx, user_data,
                                             keyword_args, n_keyword_args,
                                             string, keywords,
                                             use_html_escape);
      } else {
        unsigned int i;
        for (i = 0; i < n_keyword_args; i++) {
          grn_table_add(ctx, keywords,
                        GRN_TEXT_VALUE(keyword_args[i]),
                        GRN_TEXT_LEN(keyword_args[i]),
                        NULL);
        }
        highlighted = highlight_keywords(ctx, user_data,
                                         string, keywords, use_html_escape,
                                         default_open_tag,
                                         default_open_tag_length,
                                         default_close_tag,
                                         default_close_tag_length);
      }
    }
  }
#undef N_REQUIRED_ARGS

exit :
  /* Every path that produced nothing returns a void object. */
  if (!highlighted) {
    highlighted = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_VOID, 0);
  }

  return highlighted;
}
/*
 * Builds a tokenizer query from the values popped off `ctx` and from the
 * table in args[0].
 *
 * Three objects are popped from the context, in this order: flags, the
 * query string and the tokenize mode. Missing flags default to 0 and a
 * missing mode to GRN_TOKENIZE_ADD. The query keeps a NUL-terminated copy
 * of the raw string and a normalized string opened with the table's
 * normalizer (GRN_NORMALIZER_AUTO when the table has GRN_OBJ_KEY_NORMALIZE)
 * and encoding. have_tokenized_delimiter is computed from the normalized
 * text only when GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER is set.
 *
 * Returns the query, or NULL on a missing argument or a failed allocation /
 * normalization (everything allocated so far is released).
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* The pop order is part of the calling convention: flags, text, mode. */
  grn_obj *flags_obj = grn_ctx_pop(ctx);
  grn_obj *text_obj = grn_ctx_pop(ctx);
  grn_obj *mode_obj = grn_ctx_pop(ctx);
  grn_tokenizer_query *new_query;
  grn_obj *lexicon_table;
  grn_obj_flags lexicon_flags;
  grn_encoding lexicon_encoding;
  grn_obj *normalizer = NULL;
  grn_obj *normalized;
  unsigned int raw_length;
  char *raw_copy;

  if (text_obj == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  new_query = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
  if (new_query == NULL) {
    return NULL;
  }
  new_query->normalized_query = NULL;
  new_query->query_buf = NULL;
  new_query->flags = flags_obj ? GRN_UINT32_VALUE(flags_obj) : 0;
  if (mode_obj) {
    new_query->tokenize_mode = GRN_UINT32_VALUE(mode_obj);
  } else {
    new_query->tokenize_mode = GRN_TOKENIZE_ADD;
  }
  new_query->token_mode = new_query->tokenize_mode;

  lexicon_table = args[0];
  raw_length = GRN_TEXT_LEN(text_obj);
  raw_copy = (char *)GRN_PLUGIN_MALLOC(ctx, raw_length + 1);
  if (raw_copy == NULL) {
    GRN_PLUGIN_FREE(ctx, new_query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer] failed to duplicate query");
    return NULL;
  }

  grn_table_get_info(ctx, lexicon_table, &lexicon_flags, &lexicon_encoding,
                     NULL, &normalizer, NULL);
  if (lexicon_flags & GRN_OBJ_KEY_NORMALIZE) {
    normalizer = GRN_NORMALIZER_AUTO;
  }

  normalized = grn_string_open_(ctx,
                                GRN_TEXT_VALUE(text_obj),
                                GRN_TEXT_LEN(text_obj),
                                normalizer,
                                normalize_flags,
                                lexicon_encoding);
  if (!normalized) {
    GRN_PLUGIN_FREE(ctx, raw_copy);
    GRN_PLUGIN_FREE(ctx, new_query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer] failed to open normalized string");
    return NULL;
  }
  new_query->normalized_query = normalized;

  grn_memcpy(raw_copy, GRN_TEXT_VALUE(text_obj), raw_length);
  raw_copy[raw_length] = '\0';
  new_query->query_buf = raw_copy;
  new_query->ptr = raw_copy;
  new_query->length = raw_length;
  new_query->encoding = lexicon_encoding;

  if (new_query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
    const char *normalized_string;
    unsigned int normalized_string_length;

    grn_string_get_normalized(ctx,
                              new_query->normalized_query,
                              &normalized_string,
                              &normalized_string_length,
                              NULL);
    new_query->have_tokenized_delimiter =
      grn_tokenizer_have_tokenized_delimiter(ctx,
                                             normalized_string,
                                             normalized_string_length,
                                             new_query->encoding);
  } else {
    new_query->have_tokenized_delimiter = GRN_FALSE;
  }

  return new_query;
}
/*
 * table_tokenize command: tokenizes "string" with the lexicon named by
 * "table" and outputs the tokens.
 *
 * Variables read: "table", "string", "flags", "mode" and "index_column".
 * "mode" may be empty or "GET" (GRN_TOKEN_GET) or "ADD" (GRN_TOKEN_ADD).
 * "index_column", when given, must name an index column of the lexicon and
 * is passed on to output_tokens().
 *
 * Always returns NULL. Every invalid input - including a table name that
 * does not resolve to an object - raises GRN_INVALID_ARGUMENT on `ctx`, so
 * the command never fails silently.
 */
static grn_obj *
command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args,
                       grn_user_data *user_data)
{
  grn_obj *table_name;
  grn_obj *string;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *index_column_name;

  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  index_column_name = grn_plugin_proc_get_var(ctx, user_data,
                                              "index_column", -1);

  if (GRN_TEXT_LEN(table_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[table_tokenize] table name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[table_tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;
    grn_obj *index_column = NULL;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = grn_ctx_get(ctx,
                          GRN_TEXT_VALUE(table_name),
                          GRN_TEXT_LEN(table_name));
    if (!lexicon) {
      /* Report the unknown table like the unknown index column below. */
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[table_tokenize] nonexistent lexicon: <%.*s>",
                       (int)GRN_TEXT_LEN(table_name),
                       GRN_TEXT_VALUE(table_name));
      return NULL;
    }

#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    if (GRN_TEXT_LEN(index_column_name) > 0) {
      index_column = grn_obj_column(ctx, lexicon,
                                    GRN_TEXT_VALUE(index_column_name),
                                    GRN_TEXT_LEN(index_column_name));
      if (!index_column) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] nonexistent index column: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
      if (index_column->header.type != GRN_COLUMN_INDEX) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      /* An empty mode is treated as "GET". */
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else if (MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name),
                         GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

exit:
    grn_obj_unlink(ctx, lexicon);
    if (index_column) {
      grn_obj_unlink(ctx, index_column);
    }
  }

  return NULL;
}
static grn_obj * yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, unsigned short ngram_unit, grn_bool ignore_blank, grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit, grn_bool skip_overlap, unsigned short use_vgram) { grn_tokenizer_query *query; unsigned int normalize_flags = GRN_STRING_WITH_TYPES | GRN_STRING_REMOVE_TOKENIZED_DELIMITER | GRN_STRING_REMOVE_BLANK; const char *normalized; unsigned int normalized_length_in_bytes; grn_yangram_tokenizer *tokenizer; query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags); if (!query) { return NULL; } if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer)))) { GRN_PLUGIN_ERROR(ctx,GRN_NO_MEMORY_AVAILABLE, "[tokenizer][yangram] " "memory allocation to grn_yangram_tokenizer failed"); grn_tokenizer_query_close(ctx, query); return NULL; } user_data->ptr = tokenizer; grn_tokenizer_token_init(ctx, &(tokenizer->token)); tokenizer->query = query; tokenizer->skip_overlap = skip_overlap; tokenizer->ignore_blank = ignore_blank; tokenizer->ngram_unit = ngram_unit; tokenizer->split_symbol = split_symbol; tokenizer->split_alpha = split_alpha; tokenizer->split_digit = split_digit; tokenizer->use_vgram = use_vgram; if (tokenizer->use_vgram > 0) { const char *vgram_word_table_name_env; vgram_word_table_name_env = getenv("GRN_VGRAM_WORD_TABLE_NAME"); if (vgram_word_table_name_env) { tokenizer->vgram_table = grn_ctx_get(ctx, vgram_word_table_name_env, strlen(vgram_word_table_name_env)); } else { tokenizer->vgram_table = grn_ctx_get(ctx, VGRAM_WORD_TABLE_NAME, strlen(VGRAM_WORD_TABLE_NAME)); } if (!tokenizer->vgram_table) { GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, "[tokenizer][yangram] " "couldn't open a vgram table"); tokenizer->vgram_table = NULL; return NULL; } } else { tokenizer->vgram_table = NULL; } grn_string_get_normalized(ctx, tokenizer->query->normalized_query, &normalized, &normalized_length_in_bytes, NULL); { const char *phrase_table_name_env; 
phrase_table_name_env = getenv("GRN_KNOWN_PHRASE_TABLE_NAME"); if (phrase_table_name_env) { tokenizer->phrase_table = grn_ctx_get(ctx, phrase_table_name_env, strlen(phrase_table_name_env)); } else { tokenizer->phrase_table = grn_ctx_get(ctx, KNOWN_PHRASE_TABLE_NAME, strlen(KNOWN_PHRASE_TABLE_NAME)); } if (tokenizer->phrase_table) { if (!(tokenizer->hits = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_pat_scan_hit) * MAX_N_HITS))) { GRN_PLUGIN_ERROR(ctx,GRN_NO_MEMORY_AVAILABLE, "[tokenizer][yangram] " "memory allocation to grn_pat_scan_hit failed"); grn_tokenizer_query_close(ctx, query); return NULL; } else { tokenizer->scan_rest = normalized; tokenizer->nhits = 0; tokenizer->current_hit = 0; } } else { tokenizer->phrase_table = NULL; } } tokenizer->next = (const unsigned char *)normalized; tokenizer->end = tokenizer->next + normalized_length_in_bytes; tokenizer->rest_length = tokenizer->end - tokenizer->next; tokenizer->ctypes = grn_string_get_types(ctx, tokenizer->query->normalized_query); tokenizer->pushed_token_tail = NULL; tokenizer->ctypes_next = 0; return NULL; }
/*
 * `tokenize` command.
 *
 * Builds a lexicon with create_lexicon_for_tokenize() from the
 * "tokenizer", "normalizer" and "token_filters" variables, tokenizes the
 * "string" variable with it and passes the tokens to output_tokens().
 *
 * The "mode" variable selects the behaviour:
 *   empty or "ADD" - tokenize once in GRN_TOKEN_ADD mode.
 *   "GET"          - tokenize in GRN_TOKEN_ADD mode first, discard those
 *                    tokens, then tokenize again in GRN_TOKEN_GET mode.
 *   anything else  - GRN_INVALID_ARGUMENT.
 *
 * Always returns NULL. Failures are reported through GRN_PLUGIN_ERROR().
 */
static grn_obj *
command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args,
                 grn_user_data *user_data)
{
  grn_obj *tokenizer_name =
    grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
  grn_obj *string =
    grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  grn_obj *normalizer_name =
    grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
  grn_obj *flag_names =
    grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  grn_obj *mode_name =
    grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  grn_obj *token_filter_names =
    grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);
  unsigned int parsed_flags;
  grn_obj *temporary_lexicon;
  grn_obj collected;
  size_t mode_length;
  grn_bool add_only;
  grn_bool add_then_get;

  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] tokenizer name is missing");
    return NULL;
  }
  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] string is missing");
    return NULL;
  }

  parsed_flags = parse_tokenize_flags(ctx, flag_names);
  if (ctx->rc != GRN_SUCCESS) {
    return NULL;
  }

  temporary_lexicon = create_lexicon_for_tokenize(ctx,
                                                  tokenizer_name,
                                                  normalizer_name,
                                                  token_filter_names);
  if (!temporary_lexicon) {
    return NULL;
  }

  /* Classify the requested mode once; an empty mode means "ADD". */
  mode_length = GRN_TEXT_LEN(mode_name);
  add_only = GRN_FALSE;
  add_then_get = GRN_FALSE;
  if (mode_length == 0) {
    add_only = GRN_TRUE;
  } else if (mode_length == strlen("ADD") &&
             memcmp(GRN_TEXT_VALUE(mode_name), "ADD", strlen("ADD")) == 0) {
    add_only = GRN_TRUE;
  } else if (mode_length == strlen("GET") &&
             memcmp(GRN_TEXT_VALUE(mode_name), "GET", strlen("GET")) == 0) {
    add_then_get = GRN_TRUE;
  }

  GRN_VALUE_FIX_SIZE_INIT(&collected, GRN_OBJ_VECTOR, GRN_ID_NIL);
  if (add_only || add_then_get) {
    /* Both supported modes start with an ADD pass over the string. */
    tokenize(ctx, temporary_lexicon, string, GRN_TOKEN_ADD,
             parsed_flags, &collected);
    if (add_then_get) {
      /* Drop the ADD results and collect the GET results instead. */
      GRN_BULK_REWIND(&collected);
      tokenize(ctx, temporary_lexicon, string, GRN_TOKEN_GET,
               parsed_flags, &collected);
    }
    output_tokens(ctx, &collected, temporary_lexicon, NULL);
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] invalid mode: <%.*s>",
                     (int)GRN_TEXT_LEN(mode_name),
                     GRN_TEXT_VALUE(mode_name));
  }
  GRN_OBJ_FIN(ctx, &collected);

  grn_obj_unlink(ctx, temporary_lexicon);

  return NULL;
}
/* TODO: support caching for the same parameter. */
/*
 * snippet(text, keyword..., [options]) function.
 *
 * args[0] is the text to search. If the last argument is a hash table
 * (GRN_TABLE_HASH_KEY) it is treated as an option table; recognised
 * option names are:
 *   width, max_n_results, skip_leading_spaces, html_escape, prefix,
 *   suffix, normalizer, default_open_tag, default_close_tag.
 * Any other option name is an error (GRN_INVALID_ARGUMENT).
 *
 * The arguments between the text and the option table are keywords:
 *   - when neither default tag has a non-zero length, they are consumed
 *     in groups of three (keyword, open tag, close tag); a trailing
 *     incomplete group is ignored;
 *   - otherwise every argument is a keyword without its own tags.
 *
 * Returns the object produced by snippet_exec(). When that is NULL, when
 * too few arguments are given, or when an error occurs, a GRN_DB_VOID
 * object allocated with grn_plugin_proc_alloc() is returned instead.
 *
 * NOTE(review): the snip object opened here is not closed in this
 * function; whether another component releases it is not visible from
 * here -- confirm.
 */
static grn_obj *
func_snippet(grn_ctx *ctx, int nargs, grn_obj **args,
             grn_user_data *user_data)
{
#define N_REQUIRED_ARGS 1
#define KEYWORD_SET_SIZE 3
#define OPTION_NAME_IS(literal)\
  (key_size == (int)(sizeof(literal) - 1) &&\
   memcmp(key, literal, sizeof(literal) - 1) == 0)

  grn_obj *result = NULL;
  grn_obj *text;
  grn_obj *last_arg;
  grn_obj *snip;
  unsigned int width = 200;
  unsigned int max_n_results = 3;
  grn_snip_mapping *mapping = NULL;
  int flags = GRN_SNIP_SKIP_LEADING_SPACES;
  const char *prefix = NULL;
  int prefix_length = 0;
  const char *suffix = NULL;
  int suffix_length = 0;
  const char *normalizer_name = NULL;
  int normalizer_name_length = 0;
  const char *default_open_tag = NULL;
  int default_open_tag_length = 0;
  const char *default_close_tag = NULL;
  int default_close_tag_length = 0;
  int n_positional_args = nargs;
  unsigned int i;

  /* At least one argument besides the text is needed to do anything. */
  if (nargs <= N_REQUIRED_ARGS) {
    goto exit;
  }

  text = args[0];
  last_arg = args[nargs - 1];

  if (last_arg->header.type == GRN_TABLE_HASH_KEY) {
    grn_hash_cursor *cursor;
    void *key;
    int key_size;
    grn_obj *value;

    /* The option table is not a keyword. */
    n_positional_args--;

    cursor = grn_hash_cursor_open(ctx, (grn_hash *)last_arg,
                                  NULL, 0, NULL, 0,
                                  0, -1, 0);
    if (!cursor) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "snippet(): couldn't open cursor");
      goto exit;
    }

    while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
      grn_hash_cursor_get_key_value(ctx, cursor,
                                    &key, &key_size,
                                    (void **)&value);
      if (OPTION_NAME_IS("width")) {
        width = GRN_UINT32_VALUE(value);
      } else if (OPTION_NAME_IS("max_n_results")) {
        max_n_results = GRN_UINT32_VALUE(value);
      } else if (OPTION_NAME_IS("skip_leading_spaces")) {
        /* The flag is on by default; only an explicit false clears it. */
        if (GRN_BOOL_VALUE(value) == GRN_FALSE) {
          flags &= ~GRN_SNIP_SKIP_LEADING_SPACES;
        }
      } else if (OPTION_NAME_IS("html_escape")) {
        if (GRN_BOOL_VALUE(value)) {
          mapping = GRN_SNIP_MAPPING_HTML_ESCAPE;
        }
      } else if (OPTION_NAME_IS("prefix")) {
        prefix = GRN_TEXT_VALUE(value);
        prefix_length = GRN_TEXT_LEN(value);
      } else if (OPTION_NAME_IS("suffix")) {
        suffix = GRN_TEXT_VALUE(value);
        suffix_length = GRN_TEXT_LEN(value);
      } else if (OPTION_NAME_IS("normalizer")) {
        normalizer_name = GRN_TEXT_VALUE(value);
        normalizer_name_length = GRN_TEXT_LEN(value);
      } else if (OPTION_NAME_IS("default_open_tag")) {
        default_open_tag = GRN_TEXT_VALUE(value);
        default_open_tag_length = GRN_TEXT_LEN(value);
      } else if (OPTION_NAME_IS("default_close_tag")) {
        default_close_tag = GRN_TEXT_VALUE(value);
        default_close_tag_length = GRN_TEXT_LEN(value);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "invalid option name: <%.*s>",
                         key_size, (char *)key);
        grn_hash_cursor_close(ctx, cursor);
        goto exit;
      }
    }
    grn_hash_cursor_close(ctx, cursor);
  }

  snip = grn_snip_open(ctx, flags, width, max_n_results,
                       default_open_tag, default_open_tag_length,
                       default_close_tag, default_close_tag_length,
                       mapping);
  if (!snip) {
    goto exit;
  }

  if (!normalizer_name) {
    /* No "normalizer" option at all: use the automatic normalizer. */
    grn_snip_set_normalizer(ctx, snip, GRN_NORMALIZER_AUTO);
  } else if (normalizer_name_length > 0) {
    grn_obj *normalizer = grn_ctx_get(ctx,
                                      normalizer_name,
                                      normalizer_name_length);
    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
      grn_obj inspected;
      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, normalizer);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "snippet(): not normalizer: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      grn_obj_unlink(ctx, normalizer);
      goto exit;
    }
    grn_snip_set_normalizer(ctx, snip, normalizer);
    grn_obj_unlink(ctx, normalizer);
  }
  /* An empty "normalizer" option sets no normalizer at all. */

  if (default_open_tag_length != 0 || default_close_tag_length != 0) {
    /* Default tags given: every remaining argument is a bare keyword. */
    unsigned int n_keywords = n_positional_args - N_REQUIRED_ARGS;
    grn_obj **keywords = args + N_REQUIRED_ARGS;
    for (i = 0; i < n_keywords; i++) {
      grn_obj *keyword = keywords[i];
      (void)grn_snip_add_cond(ctx, snip,
                              GRN_TEXT_VALUE(keyword),
                              GRN_TEXT_LEN(keyword),
                              NULL, 0,
                              NULL, 0);
    }
  } else {
    /* No default tags: arguments come as (keyword, open, close) sets. */
    unsigned int n_keyword_sets =
      (n_positional_args - N_REQUIRED_ARGS) / KEYWORD_SET_SIZE;
    grn_obj **keyword_sets = args + N_REQUIRED_ARGS;
    for (i = 0; i < n_keyword_sets; i++) {
      grn_obj **set = keyword_sets + i * KEYWORD_SET_SIZE;
      (void)grn_snip_add_cond(ctx, snip,
                              GRN_TEXT_VALUE(set[0]),
                              GRN_TEXT_LEN(set[0]),
                              GRN_TEXT_VALUE(set[1]),
                              GRN_TEXT_LEN(set[1]),
                              GRN_TEXT_VALUE(set[2]),
                              GRN_TEXT_LEN(set[2]));
    }
  }

  result = snippet_exec(ctx, snip, text, user_data,
                        prefix, prefix_length,
                        suffix, suffix_length);

#undef OPTION_NAME_IS
#undef KEYWORD_SET_SIZE
#undef N_REQUIRED_ARGS

exit :
  /* Callers always get an object back; void stands for "no snippets". */
  if (!result) {
    result = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_VOID, 0);
  }

  return result;
}
static grn_obj * func_time_classify_raw(grn_ctx *ctx, int n_args, grn_obj **args, grn_user_data *user_data, const char *function_name, grn_time_classify_unit unit) { grn_obj *time; uint32_t interval_raw = 1; grn_obj *classed_time; grn_bool accept_interval = GRN_TRUE; switch (unit) { case GRN_TIME_CLASSIFY_UNIT_SECOND : case GRN_TIME_CLASSIFY_UNIT_MINUTE : case GRN_TIME_CLASSIFY_UNIT_HOUR : accept_interval = GRN_TRUE; break; case GRN_TIME_CLASSIFY_UNIT_DAY : case GRN_TIME_CLASSIFY_UNIT_WEEK : accept_interval = GRN_FALSE; break; case GRN_TIME_CLASSIFY_UNIT_MONTH : case GRN_TIME_CLASSIFY_UNIT_YEAR : accept_interval = GRN_TRUE; break; } if (accept_interval) { if (!(n_args == 1 || n_args == 2)) { GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "%s(): " "wrong number of arguments (%d for 1..2)", function_name, n_args); return NULL; } } else { if (n_args != 1) { GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "%s(): " "wrong number of arguments (%d for 1)", function_name, n_args); return NULL; } } time = args[0]; if (!(time->header.type == GRN_BULK && time->header.domain == GRN_DB_TIME)) { grn_obj inspected; GRN_TEXT_INIT(&inspected, 0); grn_inspect(ctx, &inspected, time); GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "%s(): " "the first argument must be a time: " "<%.*s>", function_name, (int)GRN_TEXT_LEN(&inspected), GRN_TEXT_VALUE(&inspected)); GRN_OBJ_FIN(ctx, &inspected); return NULL; } if (n_args == 2) { grn_obj *interval; grn_obj casted_interval; interval = args[1]; if (!(interval->header.type == GRN_BULK && grn_type_id_is_number_family(ctx, interval->header.domain))) { grn_obj inspected; GRN_TEXT_INIT(&inspected, 0); grn_inspect(ctx, &inspected, interval); GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "%s(): " "the second argument must be a number: " "<%.*s>", function_name, (int)GRN_TEXT_LEN(&inspected), GRN_TEXT_VALUE(&inspected)); GRN_OBJ_FIN(ctx, &inspected); return NULL; } GRN_VALUE_FIX_SIZE_INIT(&casted_interval, 0, GRN_DB_UINT32); grn_obj_cast(ctx, interval, 
&casted_interval, GRN_FALSE); interval_raw = GRN_UINT32_VALUE(&casted_interval); GRN_OBJ_FIN(ctx, &casted_interval); } { int64_t time_raw; struct tm tm; int64_t classed_time_raw; time_raw = GRN_TIME_VALUE(time); if (!grn_time_to_tm(ctx, time_raw, &tm)) { return NULL; } switch (unit) { case GRN_TIME_CLASSIFY_UNIT_SECOND : tm.tm_sec = (tm.tm_sec / interval_raw) * interval_raw; break; case GRN_TIME_CLASSIFY_UNIT_MINUTE : tm.tm_min = (tm.tm_min / interval_raw) * interval_raw; tm.tm_sec = 0; break; case GRN_TIME_CLASSIFY_UNIT_HOUR : tm.tm_hour = (tm.tm_hour / interval_raw) * interval_raw; tm.tm_min = 0; tm.tm_sec = 0; break; case GRN_TIME_CLASSIFY_UNIT_DAY : tm.tm_hour = 0; tm.tm_min = 0; tm.tm_sec = 0; break; case GRN_TIME_CLASSIFY_UNIT_WEEK : if ((tm.tm_mday - tm.tm_wday) >= 0) { tm.tm_mday -= tm.tm_wday; } else { int n_underflowed_mday = -(tm.tm_mday - tm.tm_wday); int mday; int max_mday = 31; if (tm.tm_mon == 0) { tm.tm_year--; tm.tm_mon = 11; } else { tm.tm_mon--; } for (mday = max_mday; mday > n_underflowed_mday; mday--) { int64_t unused; tm.tm_mday = mday; if (grn_time_from_tm(ctx, &unused, &tm)) { break; } } tm.tm_mday -= n_underflowed_mday; } tm.tm_hour = 0; tm.tm_min = 0; tm.tm_sec = 0; break; case GRN_TIME_CLASSIFY_UNIT_MONTH : tm.tm_mon = (tm.tm_mon / interval_raw) * interval_raw; tm.tm_mday = 1; tm.tm_hour = 0; tm.tm_min = 0; tm.tm_sec = 0; break; case GRN_TIME_CLASSIFY_UNIT_YEAR : tm.tm_year = (((1900 + tm.tm_year) / interval_raw) * interval_raw) - 1900; tm.tm_mon = 0; tm.tm_mday = 1; tm.tm_hour = 0; tm.tm_min = 0; tm.tm_sec = 0; break; } if (!grn_time_from_tm(ctx, &classed_time_raw, &tm)) { return NULL; } classed_time = grn_plugin_proc_alloc(ctx, user_data, time->header.domain, 0); if (!classed_time) { return NULL; } GRN_TIME_SET(ctx, classed_time, classed_time_raw); return classed_time; } }
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).

  Steps:
    1. Open a tokenizer query for the arguments.
    2. Create the shared MeCab instance (`sole_mecab') on first use, under
       `sole_mecab_mutex', and record its dictionary encoding.
    3. Reject the request when the query encoding differs from the
       dictionary encoding.
    4. Allocate the tokenizer and fill `tokenizer->buf' with the MeCab
       output, unless the query already has tokenized delimiters or the
       normalized string is empty.
    5. Store the tokenizer in `user_data->ptr' only on success; on every
       failure path the resources acquired so far are released here.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  /* Create the shared MeCab instance lazily; the second check under the
     mutex avoids creating it twice. */
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    /* Copy the encoding out first: the query must not be touched once it
       has been closed. */
    const grn_encoding query_encoding = query->encoding;

    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query_encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    /* The input is already split; iterate over it directly. */
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      /* The buffer was initialised above and may already hold data from
         a partially completed tokenization; finalise it before the
         tokenizer that embeds it is freed. */
      GRN_OBJ_FIN(ctx, &(tokenizer->buf));
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      /* The scan starts at the second to last byte and walks backwards,
         overwriting whitespace with NUL.
         NOTE(review): this presumes the buffer holds at least two bytes
         here -- confirm for every MeCab output. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) {
        *p = '\0';
      }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }

  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}