/*
 * Tokenizer initializer for uvector (grn_id vector) input.
 * Pops the three pushed arguments (flags, string, mode) from the ctx
 * stack, allocates the tokenizer state and points it at the raw bytes
 * of the input string. The "tokens" are consumed sizeof(grn_id) bytes
 * at a time by the next callback. Always returns NULL; errors are
 * reported through ERR()/ctx->rc.
 */
static grn_obj *
uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *string_arg;
  grn_obj *flags_arg;
  grn_obj *mode_arg;
  grn_uvector_tokenizer *state;

  /* Arguments were pushed in reverse order: flags first, then string,
     then mode. Each missing argument is a caller error. */
  flags_arg = grn_ctx_pop(ctx);
  if (!flags_arg) {
    ERR(GRN_INVALID_ARGUMENT,
        "[tokenizer][uvector] missing argument: flags");
    return NULL;
  }
  string_arg = grn_ctx_pop(ctx);
  if (!string_arg) {
    ERR(GRN_INVALID_ARGUMENT,
        "[tokenizer][uvector] missing argument: string");
    return NULL;
  }
  mode_arg = grn_ctx_pop(ctx);
  if (!mode_arg) {
    ERR(GRN_INVALID_ARGUMENT,
        "[tokenizer][uvector] missing argument: mode");
    return NULL;
  }

  state = GRN_MALLOC(sizeof(grn_uvector_tokenizer));
  if (!state) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][uvector] "
        "memory allocation to grn_uvector_tokenizer failed");
    return NULL;
  }
  user_data->ptr = state;

  grn_tokenizer_token_init(ctx, &(state->token));
  /* Walk the raw value bytes; curr advances in units of grn_id. */
  state->curr = (byte *)GRN_TEXT_VALUE(string_arg);
  state->tail = state->curr + GRN_TEXT_LEN(string_arg);
  state->unit = sizeof(grn_id);
  return NULL;
}
/*
 * Builds a grn_tokenizer_query from the arguments a tokenizer caller
 * pushed onto the ctx stack.
 *
 * Stack protocol: flags, query string and tokenize mode are popped in
 * that order (the caller pushed them in reverse). `args[0]` must be the
 * lexicon table. Returns a newly allocated query on success, NULL on
 * error (with ctx->rc set via GRN_PLUGIN_ERROR). Ownership of the
 * returned query passes to the caller (grn_tokenizer_query_close).
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags;
  grn_obj *query_str;
  grn_obj *tokenize_mode;
  GRN_API_ENTER;
  /* Pop all three even when some are unused, to keep the stack balanced. */
  flags = grn_ctx_pop(ctx);
  query_str = grn_ctx_pop(ctx);
  tokenize_mode = grn_ctx_pop(ctx);
  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    GRN_API_RETURN(NULL);
  }
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    GRN_API_RETURN(NULL);
  }
  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (!query) {
      GRN_API_RETURN(NULL);
    }
    grn_tokenizer_query_init(ctx, query);
    /* Copy the raw query first; setting flags/mode/lexicon afterwards lets
       the accessors normalize lazily against the final configuration. */
    grn_tokenizer_query_set_raw_string(ctx, query,
                                       GRN_TEXT_VALUE(query_str),
                                       GRN_TEXT_LEN(query_str));
    if (ctx->rc != GRN_SUCCESS) {
      GRN_PLUGIN_FREE(ctx, query);
      GRN_API_RETURN(NULL);
    }
    /* flags and tokenize_mode are optional; keep the query's defaults
       when they were not pushed. */
    if (flags) {
      grn_tokenizer_query_set_flags(ctx, query, GRN_UINT32_VALUE(flags));
    }
    if (tokenize_mode) {
      grn_tokenizer_query_set_mode(ctx, query, GRN_UINT32_VALUE(tokenize_mode));
    }
    grn_tokenizer_query_set_normalize_flags(ctx, query, normalize_flags);
    grn_tokenizer_query_set_lexicon(ctx, query, args[0]);
    grn_tokenizer_query_ensure_have_tokenized_delimiter(ctx, query);
    GRN_API_RETURN(query);
  }
}
/*
 * Tokenizer initializer that splits the popped input string on a fixed
 * byte delimiter. Normalizes the input according to the lexicon table's
 * GRN_OBJ_KEY_NORMALIZE flag, then positions the scan cursor at the
 * start of the normalized text. Always returns NULL; errors are
 * reported via ERR()/ctx->rc.
 *
 * Fix: `user_data->ptr` is now assigned only after initialization fully
 * succeeds. The original set it before grn_str_open_() and then freed
 * the token on failure, leaving user_data->ptr dangling for any later
 * fin callback.
 */
static grn_obj *
delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
               uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_obj *str;
  int nflags = 0;
  grn_delimited_tokenizer *token;
  grn_obj_flags table_flags;
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    /* GRN_MALLOC reports the allocation failure through ctx->rc. */
    return NULL;
  }
  token->delimiter = delimiter;
  token->delimiter_len = delimiter_len;
  token->pos = 0;
  /* Inherit the lexicon's normalization setting. */
  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  /* Publish the tokenizer state only now that it is fully valid. */
  user_data->ptr = token;
  token->next = (unsigned char *)token->nstr->norm;
  token->end = token->next + token->nstr->norm_blen;
  token->len = token->nstr->length;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}
/*
 * N-gram tokenizer initializer (fixed unigram settings for alphabetic,
 * digit and symbol runs). Pops the input string from the ctx stack,
 * normalizes it per the lexicon's GRN_OBJ_KEY_NORMALIZE flag and
 * initializes the scan state.
 *
 * Fix: on grn_str_open_() failure the original returned without freeing
 * `token` (memory leak) and left `user_data->ptr` pointing at the
 * half-initialized state. Now the token is freed and the pointer cleared.
 */
static grn_rc
ngram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
           uint8_t ngram_unit)
{
  grn_obj *str;
  /* Blanks are removed; character-type info is needed for ngram grouping. */
  int nflags = GRN_STR_REMOVEBLANK|GRN_STR_WITH_CTYPES;
  grn_ngram_tokenizer *token;
  grn_obj_flags table_flags;
  if (!(str = grn_ctx_pop(ctx))) { return GRN_INVALID_ARGUMENT; }
  if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { return ctx->rc; }
  user_data->ptr = token;
  token->uni_alpha = 1;
  token->uni_digit = 1;
  token->uni_symbol = 1;
  token->ngram_unit = ngram_unit;
  token->overlap = 0;
  token->pos = 0;
  token->skip = 0;
  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open failed at grn_token_open");
    /* Release the state and clear the published pointer so a later fin
       callback cannot touch freed memory. */
    GRN_FREE(token);
    user_data->ptr = NULL;
    return GRN_TOKENIZER_ERROR;
  }
  token->next = (unsigned char *)token->nstr->norm;
  token->end = token->next + token->nstr->norm_blen;
  token->ctypes = token->nstr->ctypes;
  token->len = token->nstr->length;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return GRN_SUCCESS;
}
/*
 * call-seq:
 *   context.pop -> value
 *
 * Pops a value from the stack held inside the context. The results of
 * Groonga::Expression#execute are stored on this stack.
 */
static VALUE
rb_grn_context_pop (VALUE self)
{
    grn_ctx *context = SELF(self);
    grn_obj *value = grn_ctx_pop(context);
    return GRNOBJ2RVAL(Qnil, context, value, self);
}
/*
 * Tokenizer initializer for uvector (grn_id vector) input.
 * Pops the input string and scans its raw bytes in sizeof(grn_id)
 * units; no normalization is performed.
 */
static grn_rc
uvector_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_obj *source;
  grn_uvector_tokenizer_info *state;

  source = grn_ctx_pop(ctx);
  if (!source) {
    return GRN_INVALID_ARGUMENT;
  }
  state = GRN_MALLOC(sizeof(grn_uvector_tokenizer_info));
  if (!state) {
    return ctx->rc;
  }
  user_data->ptr = state;
  /* Cursor over the raw value; advanced one grn_id per token. */
  state->curr = GRN_TEXT_VALUE(source);
  state->tail = state->curr + GRN_TEXT_LEN(source);
  state->unit = sizeof(grn_id);
  GRN_TEXT_INIT(&state->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&state->stat_, 0);
  return GRN_SUCCESS;
}
/*
 * Tokenizer initializer for uvector (grn_id vector) input.
 * Pops the input string from the ctx stack and scans its raw bytes in
 * sizeof(grn_id) units. Always returns NULL; errors go through ERR().
 */
static grn_obj *
uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *source;
  grn_uvector_tokenizer_info *state;

  source = grn_ctx_pop(ctx);
  if (!source) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  state = GRN_MALLOC(sizeof(grn_uvector_tokenizer_info));
  if (!state) {
    return NULL;
  }
  user_data->ptr = state;
  /* Cursor over the raw value; advanced one grn_id per token. */
  state->curr = GRN_TEXT_VALUE(source);
  state->tail = state->curr + GRN_TEXT_LEN(source);
  state->unit = sizeof(grn_id);
  GRN_TEXT_INIT(&state->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&state->stat_, 0);
  return NULL;
}
/*
 * N-gram tokenizer initializer with configurable unigram handling for
 * alphabetic/digit/symbol runs and optional blank-ignoring. Pops the
 * input string, normalizes per the lexicon's GRN_OBJ_KEY_NORMALIZE
 * flag, and initializes the scan state. Always returns NULL; errors go
 * through ERR()/ctx->rc.
 *
 * Fix: `user_data->ptr` is now assigned only after initialization fully
 * succeeds. The original set it before grn_str_open_() and freed the
 * token on failure, leaving user_data->ptr dangling.
 */
static grn_obj *
ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
           uint8_t ngram_unit, uint8_t uni_alpha, uint8_t uni_digit,
           uint8_t uni_symbol, uint8_t ignore_blank)
{
  grn_obj *str;
  /* Blanks are removed; character-type info is needed for ngram grouping. */
  int nflags = GRN_STR_REMOVEBLANK|GRN_STR_WITH_CTYPES;
  grn_ngram_tokenizer *token;
  grn_obj_flags table_flags;
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    return NULL;
  }
  token->uni_alpha = uni_alpha;
  token->uni_digit = uni_digit;
  token->uni_symbol = uni_symbol;
  token->ngram_unit = ngram_unit;
  token->ignore_blank = ignore_blank;
  token->overlap = 0;
  token->pos = 0;
  token->skip = 0;
  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  /* Publish the tokenizer state only now that it is fully valid. */
  user_data->ptr = token;
  token->next = (unsigned char *)token->nstr->norm;
  token->end = token->next + token->nstr->norm_blen;
  token->ctypes = token->nstr->ctypes;
  token->len = token->nstr->length;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}
/*
 * Executes a command object. New-style commands provide a run callback;
 * old-style commands are invoked through grn_proc_call, and any value
 * they left on the ctx stack is popped to keep the stack balanced.
 * Returns ctx->rc.
 */
grn_rc
grn_command_run(grn_ctx *ctx, grn_obj *command, grn_command_input *input)
{
  grn_proc *proc;
  GRN_API_ENTER;
  proc = (grn_proc *)command;
  if (!proc->callbacks.command.run) {
    /* TODO: REMOVE ME. For backward compatibility. */
    uint32_t saved_stack_curr = ctx->impl->stack_curr;
    grn_proc_call(ctx, command, 0, command);
    /* Discard a pushed result, if the old-style proc produced one. */
    if (ctx->impl->stack_curr > saved_stack_curr) {
      grn_ctx_pop(ctx);
    }
  } else {
    proc->callbacks.command.run(ctx, command, input, proc->user_data);
  }
  GRN_API_RETURN(ctx->rc);
}
/*
 * Produces the next token id from the tokenizer attached to `token`.
 *
 * Each iteration calls the tokenizer's PROC_NEXT, which pushes the token
 * text and a status word onto the ctx stack (popped here in reverse
 * order). Depending on token->add, the token is either added to or
 * looked up in the lexicon table. Returns the token id, or GRN_ID_NIL
 * when not found / on lock failure / at the end of input.
 */
grn_id
grn_token_next(grn_ctx *ctx, grn_token *token)
{
  int status;
  grn_id tid = GRN_ID_NIL;
  grn_obj *table = token->table;
  grn_obj *tokenizer = token->tokenizer;
  while (token->status != grn_token_done) {
    if (tokenizer) {
      grn_obj *curr_, *stat_;
      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table,
                                                &token->pctx.user_data);
      /* The tokenizer pushed text then status; pop in reverse order. */
      stat_ = grn_ctx_pop(ctx);
      curr_ = grn_ctx_pop(ctx);
      token->curr = GRN_TEXT_VALUE(curr_);
      token->curr_size = GRN_TEXT_LEN(curr_);
      status = GRN_UINT32_VALUE(stat_);
      /* In lookup mode (!add) we stop at REACH_END; in add mode we keep
         going until the tokenizer reports LAST. */
      token->status = ((status & GRN_TOKEN_LAST) ||
                       (!token->add && (status & GRN_TOKEN_REACH_END)))
                      ? grn_token_done : grn_token_doing;
      token->force_prefix = 0;
      if (status & GRN_TOKEN_UNMATURED) {
        if (status & GRN_TOKEN_OVERLAP) {
          /* Overlapping unmatured tokens are skipped in lookup mode. */
          if (!token->add) { token->pos++; continue; }
        } else {
          if (status & GRN_TOKEN_LAST) { token->force_prefix = 1; }
        }
      }
    } else {
      /* No tokenizer: the whole original input is a single token. */
      token->curr = token->orig;
      token->curr_size = token->orig_blen;
      token->status = grn_token_done;
    }
    if (token->add) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        /* The hard-coded 10000000 is the lock timeout in this version. */
        if (grn_io_lock(ctx, ((grn_pat *)table)->io, 10000000)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_pat_add(ctx, (grn_pat *)table, token->curr,
                            token->curr_size, NULL, NULL);
          grn_io_unlock(((grn_pat *)table)->io);
        }
        break;
      case GRN_TABLE_HASH_KEY :
        if (grn_io_lock(ctx, ((grn_hash *)table)->io, 10000000)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_hash_add(ctx, (grn_hash *)table, token->curr,
                             token->curr_size, NULL, NULL);
          grn_io_unlock(((grn_hash *)table)->io);
        }
        break;
      case GRN_TABLE_NO_KEY :
        /* NO_KEY tables: the token bytes are already a grn_id. */
        if (token->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    } else {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        tid = grn_pat_get(ctx, (grn_pat *)table, token->curr,
                          token->curr_size, NULL);
        break;
      case GRN_TABLE_HASH_KEY :
        tid = grn_hash_get(ctx, (grn_hash *)table, token->curr,
                           token->curr_size, NULL);
        break;
      case GRN_TABLE_NO_KEY :
        if (token->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    }
    if (tid == GRN_ID_NIL && token->status != grn_token_done) {
      token->status = grn_token_not_found;
    }
    token->pos++;
    break;
  }
  return tid;
}
/*
  This function is called for a full text search query or a document to
  be indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in
  this function, `ctx->rc' is overwritten with an error code (not
  GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf, *p;
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize, len;
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  /* Lazily create the process-wide MeCab instance. Double-checked under
     sole_mecab_lock so concurrent initializers create it only once. */
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR, "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }
  /* The MeCab dictionary encoding must match the lexicon's encoding,
     otherwise parsing would produce garbage. */
  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) {
    return NULL;
  }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  len = token->nstr->norm_blen;
  /* mecab_sparse_tostr2's result buffer belongs to the (shared) MeCab
     instance, so both the parse and the copy-out happen under the lock. */
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  /* NOTE(review): when s is NULL, buf is uninitialized; the short-circuit
     `!s ||` below is what keeps this from being an uninitialized read —
     do not reorder these operands. */
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }
  /* A certain version of mecab returns trailing lf or spaces. */
  for (p = buf + bufsize - 2; buf <= p && isspace(*(unsigned char *)p); p--) {
    *p = '\0';
  }
  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = p + 1;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}
/*
 * Builds a grn_tokenizer_query from the arguments pushed onto the ctx
 * stack (flags, query string, tokenize mode — popped in that order).
 * `args[0]` must be the lexicon table; its encoding and normalizer
 * configure how the query string is normalized. Returns a newly
 * allocated query (caller frees) or NULL on error with ctx->rc set.
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }
  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    /* Pre-clear the owned pointers so error paths can free safely. */
    query->normalized_query = NULL;
    query->query_buf = NULL;
    /* flags and tokenize_mode are optional; default to 0 / ADD mode. */
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    /* token_mode mirrors tokenize_mode (kept for compatibility). */
    query->token_mode = query->tokenize_mode;
    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;
      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        /* Legacy tables carry KEY_NORMALIZE instead of a normalizer
           object; map it to the AUTO normalizer. */
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        /* Keep a NUL-terminated private copy of the raw query too. */
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;
      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        /* Detect pre-tokenized input (tokens separated by the special
           delimiter character) in the normalized form. */
        const char *normalized_string;
        unsigned int normalized_string_length;
        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
/*
 * Regression test: a compiled expression persists across a database
 * close/reopen and still evaluates correctly.
 *
 * Builds two mutually-referencing tables (t1.c1 -> t2, t2.c2 -> t1),
 * fills NRECORDS rows pointing at each other, compiles an expression
 * equivalent to foo.c1.c2.c1 (which round-trips back to the original
 * record), closes and reopens the database, then checks that evaluating
 * the reloaded expression over every t1 record yields the record itself.
 *
 * NOTE(review): `context`, `expr`, `database`, `path`, `NRECORDS` and
 * `get_object` are test-suite globals/helpers defined elsewhere in this
 * file — presumably set up in the fixture; verify against the harness.
 */
void
test_persistent_expr(void)
{
  int i;
  grn_obj *t1, *t2, *c1, *c2, r1, r2, buf;
  t1 = grn_table_create(context, "t1", 2, NULL,
                        GRN_OBJ_TABLE_NO_KEY|GRN_OBJ_PERSISTENT, NULL, NULL);
  cut_assert_not_null(t1);
  t2 = grn_table_create(context, "t2", 2, NULL,
                        GRN_OBJ_TABLE_NO_KEY|GRN_OBJ_PERSISTENT, NULL, NULL);
  cut_assert_not_null(t2);
  c1 = grn_column_create(context, t1, "c1", 2, NULL, GRN_OBJ_PERSISTENT, t2);
  cut_assert_not_null(c1);
  c2 = grn_column_create(context, t2, "c2", 2, NULL, GRN_OBJ_PERSISTENT, t1);
  cut_assert_not_null(c2);
  GRN_TEXT_INIT(&buf, 0);
  GRN_RECORD_INIT(&r1, 0, grn_obj_id(context, t1));
  GRN_RECORD_INIT(&r2, 0, grn_obj_id(context, t2));
  /* Link the i-th record of each table to the i-th record of the other. */
  for (i = 0; i < NRECORDS; i++) {
    grn_id i1, i2;
    i1 = grn_table_add(context, t1, NULL, 0, NULL);
    i2 = grn_table_add(context, t2, NULL, 0, NULL);
    GRN_RECORD_SET(context, &r1, i1);
    GRN_RECORD_SET(context, &r2, i2);
    grn_obj_set_value(context, c1, i1, &r2, GRN_OBJ_SET);
    grn_obj_set_value(context, c2, i2, &r1, GRN_OBJ_SET);
  }
  {
    grn_obj *v;
    /* Compile foo.c1.c2.c1 as a persistent named expression "test". */
    expr = grn_expr_create(context, "test", 4);
    cut_assert_not_null(expr);
    v = grn_expr_add_var(context, expr, "foo", 3);
    GRN_RECORD_INIT(v, 0, grn_obj_id(context, t1));
    grn_expr_append_obj(context, expr, v, GRN_OP_PUSH, 1);
    GRN_TEXT_SETS(context, &buf, "c1");
    grn_expr_append_const(context, expr, &buf, GRN_OP_PUSH, 1);
    grn_expr_append_op(context, expr, GRN_OP_GET_VALUE, 2);
    GRN_TEXT_SETS(context, &buf, "c2");
    grn_expr_append_const(context, expr, &buf, GRN_OP_PUSH, 1);
    grn_expr_append_op(context, expr, GRN_OP_GET_VALUE, 2);
    GRN_TEXT_SETS(context, &buf, "c1");
    grn_expr_append_const(context, expr, &buf, GRN_OP_PUSH, 1);
    /*
    GRN_TEXT_SETS(context, &buf, "c1.c2.c1");
    grn_expr_append_const(context, expr, &buf);
    */
    grn_expr_append_op(context, expr, GRN_OP_GET_VALUE, 2);
    grn_expr_compile(context, expr);
    /* Close the expression so it is flushed to the persistent db. */
    grn_test_assert(grn_obj_close(context, expr));
    expr = NULL;
  }
  grn_test_assert(grn_obj_close(context, &buf));
  /* Reopen the database to force everything to be reloaded from disk. */
  grn_db_close(context, database);
  database = grn_db_open(context, path);
  {
    grn_id id;
    uint64_t et;
    int nerr = 0;
    grn_obj *r, *v;
    grn_table_cursor *tc;
    struct timeval tvb, tve;
    expr = get_object("test");
    v = grn_expr_get_var(context, expr, "foo", 3);
    t1 = get_object("t1");
    tc = grn_table_cursor_open(context, t1, NULL, 0, NULL, 0, 0, -1, 0);
    cut_assert_not_null(tc);
    gettimeofday(&tvb, NULL);
    /* c1.c2.c1 must round-trip each t1 record back to itself. */
    while ((id = grn_table_cursor_next(context, tc))) {
      GRN_RECORD_SET(context, v, id);
      grn_expr_exec(context, expr, 0);
      r = grn_ctx_pop(context);
      if (GRN_RECORD_VALUE(r) != id) { nerr++; }
    }
    gettimeofday(&tve, NULL);
    et = (tve.tv_sec - tvb.tv_sec) * 1000000 + (tve.tv_usec - tvb.tv_usec);
    // printf("et=%zu\n", et);
    cut_assert_equal_uint(0, nerr);
    grn_test_assert(grn_table_cursor_close(context, tc));
  }
  grn_test_assert(grn_obj_close(context, &r1));
  grn_test_assert(grn_obj_close(context, &r2));
}
/*
 * Produces the next token id from the tokenizer attached to `token`.
 *
 * Each iteration calls the tokenizer's PROC_NEXT, which pushes the token
 * text and a status word onto the ctx stack (popped here in reverse
 * order). Empty tokens and tokens longer than GRN_TABLE_MAX_KEY_SIZE are
 * logged and skipped. Depending on token->mode the token is added to
 * (GRN_TOKEN_ADD) or looked up in (GRN_TOKEN_GET) the lexicon table.
 * Returns the token id, or GRN_ID_NIL when not found / on lock failure /
 * at the end of input.
 */
grn_id
grn_token_next(grn_ctx *ctx, grn_token *token)
{
  int status;
  grn_id tid = GRN_ID_NIL;
  grn_obj *table = token->table;
  grn_obj *tokenizer = token->tokenizer;
  while (token->status != GRN_TOKEN_DONE) {
    if (tokenizer) {
      grn_obj *curr_, *stat_;
      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table,
                                                &token->pctx.user_data);
      /* The tokenizer pushed text then status; pop in reverse order. */
      stat_ = grn_ctx_pop(ctx);
      curr_ = grn_ctx_pop(ctx);
      token->curr = (const unsigned char *)GRN_TEXT_VALUE(curr_);
      token->curr_size = GRN_TEXT_LEN(curr_);
      status = GRN_UINT32_VALUE(stat_);
      /* In GET mode we stop at REACH_END; in ADD mode we continue until
         the tokenizer reports LAST. */
      token->status = ((status & GRN_TOKENIZER_TOKEN_LAST) ||
                       (token->mode == GRN_TOKEN_GET &&
                        (status & GRN_TOKENIZER_TOKEN_REACH_END)))
                      ? GRN_TOKEN_DONE : GRN_TOKEN_DOING;
      token->force_prefix = 0;
      /* Skip (but log) empty tokens — they cannot be table keys. */
      if (token->curr_size == 0) {
        char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
        int tokenizer_name_length;
        tokenizer_name_length = grn_obj_name(ctx, token->tokenizer,
                                             tokenizer_name,
                                             GRN_TABLE_MAX_KEY_SIZE);
        GRN_LOG(ctx, GRN_WARN,
                "[token_next] ignore an empty token: <%.*s>: <%.*s>",
                tokenizer_name_length, tokenizer_name,
                token->orig_blen, token->orig);
        continue;
      }
      /* Skip tokens that would exceed the key-size limit of the table. */
      if (token->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
        GRN_LOG(ctx, GRN_WARN,
                "[token_next] ignore too long token. "
                "Token must be less than or equal to %d: <%d>(<%.*s>)",
                GRN_TABLE_MAX_KEY_SIZE,
                token->curr_size,
                token->curr_size, token->curr);
        continue;
      }
      if (status & GRN_TOKENIZER_TOKEN_UNMATURED) {
        if (status & GRN_TOKENIZER_TOKEN_OVERLAP) {
          /* Overlapping unmatured tokens are skipped in GET mode. */
          if (token->mode == GRN_TOKEN_GET) { token->pos++; continue; }
        } else {
          if (status & GRN_TOKENIZER_TOKEN_LAST) { token->force_prefix = 1; }
        }
      }
    } else {
      token->status = GRN_TOKEN_DONE;
    }
    if (token->mode == GRN_TOKEN_ADD) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_pat_add(ctx, (grn_pat *)table, token->curr,
                            token->curr_size, NULL, NULL);
          grn_io_unlock(((grn_pat *)table)->io);
        }
        break;
      case GRN_TABLE_DAT_KEY :
        if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_dat_add(ctx, (grn_dat *)table, token->curr,
                            token->curr_size, NULL, NULL);
          grn_io_unlock(((grn_dat *)table)->io);
        }
        break;
      case GRN_TABLE_HASH_KEY :
        if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_hash_add(ctx, (grn_hash *)table, token->curr,
                             token->curr_size, NULL, NULL);
          grn_io_unlock(((grn_hash *)table)->io);
        }
        break;
      case GRN_TABLE_NO_KEY :
        /* NO_KEY tables: the token bytes are already a grn_id. */
        if (token->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    } else {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        tid = grn_pat_get(ctx, (grn_pat *)table, token->curr,
                          token->curr_size, NULL);
        break;
      case GRN_TABLE_DAT_KEY :
        tid = grn_dat_get(ctx, (grn_dat *)table, token->curr,
                          token->curr_size, NULL);
        break;
      case GRN_TABLE_HASH_KEY :
        tid = grn_hash_get(ctx, (grn_hash *)table, token->curr,
                           token->curr_size, NULL);
        break;
      case GRN_TABLE_NO_KEY :
        if (token->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    }
    if (tid == GRN_ID_NIL && token->status != GRN_TOKEN_DONE) {
      token->status = GRN_TOKEN_NOT_FOUND;
    }
    token->pos++;
    break;
  }
  return tid;
}
/*
 * Column hook: rewrites tag references in `newvalue` through the synonym
 * column of the referenced table before the value is stored.
 *
 * Pops the hook arguments (flags, new value, old value, record id) from
 * the ctx stack. Text values are tokenized against the referenced table;
 * uvector/record values are mapped element-wise: any tag whose synonym
 * column holds a non-zero id is replaced by that id.
 *
 * Fix: the second is_string(domain) test now carries the same NULL guard
 * as the first one — `domain` is NULL when newvalue's domain cannot be
 * resolved, and the original dereferenced it unchecked. A NULL `table`
 * is now also rejected before grn_obj_column().
 */
static grn_obj *
command_tag_synonym(grn_ctx *ctx, GNUC_UNUSED int nargs,
                    GNUC_UNUSED grn_obj **args,
                    GNUC_UNUSED grn_user_data *user_data)
{
  /* Hook arguments were pushed in reverse order. */
  GNUC_UNUSED grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *newvalue = grn_ctx_pop(ctx);
  grn_obj *oldvalue = grn_ctx_pop(ctx);
  GNUC_UNUSED grn_obj *id = grn_ctx_pop(ctx);
  grn_obj buf;
  grn_obj record;
  grn_obj *domain;
  grn_obj *table;
  grn_obj *column;
  int i, n;
  if (GRN_BULK_VSIZE(newvalue) == 0 || GRN_INT32_VALUE(flags) == 0) {
    return NULL;
  }
  table = grn_ctx_at(ctx, oldvalue->header.domain);
  if (!table || !is_table(table)) {
    GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
                   "[tag-synonym] "
                   "hooked column must be reference type");
    return NULL;
  }
  column = grn_obj_column(ctx, table,
                          SYNONYM_COLUMN_NAME, SYNONYM_COLUMN_NAME_LEN);
  if (!column) {
    GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
                   "[tag-synonym] "
                   "couldn't open synonym column");
    return NULL;
  }
  GRN_TEXT_INIT(&buf, 0);
  domain = grn_ctx_at(ctx, newvalue->header.domain);
  if (domain && is_string(domain)) {
    /* Text input: tokenize it against the tag table to get tag ids. */
    GRN_RECORD_INIT(&record, GRN_OBJ_VECTOR, oldvalue->header.domain);
    grn_table_tokenize(ctx, table,
                       GRN_TEXT_VALUE(newvalue), GRN_TEXT_LEN(newvalue),
                       &record, GRN_TRUE);
  } else if (newvalue->header.type == GRN_UVECTOR) {
    /* Already a vector of tag ids; work on a shallow copy. */
    record = *newvalue;
  }
  if ((domain && is_string(domain)) || newvalue->header.type == GRN_UVECTOR) {
    grn_obj value;
    /* Rebuild newvalue as a vector of (possibly substituted) tag ids. */
    GRN_RECORD_INIT(newvalue, GRN_OBJ_VECTOR, oldvalue->header.domain);
    GRN_UINT32_INIT(&value, 0);
    n = grn_vector_size(ctx, &record);
    for (i = 0; i < n; i++) {
      grn_id tid;
      tid = grn_uvector_get_element(ctx, &record, i, NULL);
      GRN_BULK_REWIND(&value);
      grn_obj_get_value(ctx, column, tid, &value);
      /* Non-zero synonym id means this tag is an alias: substitute it. */
      if (GRN_UINT32_VALUE(&value)) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_INFO,
                       "[tag-synonym] "
                       "changed: tid %d -> %d", tid, GRN_UINT32_VALUE(&value));
        tid = GRN_UINT32_VALUE(&value);
      }
      grn_uvector_add_element(ctx, newvalue, tid, 0);
    }
    grn_obj_unlink(ctx, &value);
  } else {
    /* Scalar record reference: substitute the single tag id in place. */
    grn_id tid;
    grn_obj value;
    tid = GRN_RECORD_VALUE(newvalue);
    GRN_UINT32_INIT(&value, 0);
    grn_obj_get_value(ctx, column, tid, &value);
    if (GRN_UINT32_VALUE(&value)) {
      GRN_PLUGIN_LOG(ctx, GRN_LOG_INFO,
                     "[tag-synonym] "
                     "changed: tid %d -> %d", tid, GRN_UINT32_VALUE(&value));
      tid = GRN_UINT32_VALUE(&value);
      GRN_BULK_REWIND(newvalue);
      GRN_RECORD_SET(ctx, newvalue, tid);
    }
    grn_obj_unlink(ctx, &value);
  }
  /* NOTE(review): the tokenized `record` vector (text branch) and the
     accessor returned by grn_obj_column are not released here, matching
     the original behavior — verify whether they should be unlinked. */
  grn_obj_unlink(ctx, &buf);
  return NULL;
}
/*
 * MeCab tokenizer initializer (grn_rc variant). Pops the input string,
 * normalizes it per the lexicon's settings, runs MeCab wakati parsing
 * into a growable buffer (retrying with a doubled buffer on "output
 * buffer overflow"), trims trailing whitespace, and initializes the
 * scan state over the parsed text.
 *
 * Fixes: every error path now releases what it acquired — the original
 * leaked `token` on grn_str_open_() failure, leaked `token->nstr` on
 * buffer-allocation failure and on final parse failure, and left
 * `user_data->ptr` dangling after GRN_FREE(token).
 */
static grn_rc
mecab_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf, *s, *p;
  char mecab_err[256];
  grn_obj_flags table_flags;
  grn_mecab_tokenizer *token;
  unsigned int bufsize, maxtrial = 10, len;
  if (!(str = grn_ctx_pop(ctx))) { return GRN_INVALID_ARGUMENT; }
  SOLE_MECAB_CONFIRM;
  if (!sole_mecab) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "mecab_new failed on grn_mecab_init");
    return GRN_TOKENIZER_ERROR;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return ctx->rc; }
  user_data->ptr = token;
  token->mecab = sole_mecab;
  // if (!(token->mecab = mecab_new3())) {
  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open failed at grn_token_open");
    GRN_FREE(token);
    user_data->ptr = NULL;
    return GRN_TOKENIZER_ERROR;
  }
  len = token->nstr->norm_blen;
  mecab_err[sizeof(mecab_err) - 1] = '\0';
  /* Retry with a doubled buffer while MeCab reports buffer overflow. */
  for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
    if (!(buf = GRN_MALLOC(bufsize + 1))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
      grn_str_close(ctx, token->nstr);
      GRN_FREE(token);
      user_data->ptr = NULL;
      return ctx->rc;
    }
    /* sole_mecab is shared process-wide; parsing must be serialized. */
    MUTEX_LOCK(sole_mecab_lock);
    s = mecab_sparse_tostr3(token->mecab, token->nstr->norm, len,
                            buf, bufsize);
    if (!s) {
      /* Copy the error out while still holding the lock. */
      strncpy(mecab_err, mecab_strerror(token->mecab), sizeof(mecab_err) - 1);
    }
    MUTEX_UNLOCK(sole_mecab_lock);
    if (s) { break; }
    GRN_FREE(buf);
    /* Only retry on overflow; any other error is fatal. */
    if (strstr(mecab_err, "output buffer overflow") == NULL) { break; }
  }
  if (!s) {
    GRN_LOG(ctx, GRN_LOG_ALERT,
            "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
            len, bufsize, mecab_err);
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    user_data->ptr = NULL;
    return GRN_TOKENIZER_ERROR;
  }
  // certain version of mecab returns trailing lf or spaces.
  for (p = buf + strlen(buf) - 1;
       buf <= p && (*p == '\n' || isspace(*(unsigned char *)p));
       p--) {
    *p = '\0';
  }
  //grn_log("sparsed='%s'", s);
  token->buf = (unsigned char *)buf;
  token->next = (unsigned char *)buf;
  token->end = (unsigned char *)buf + strlen(buf);
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return GRN_SUCCESS;
}
/*
 * Produces the next token id from the tokenizer attached to
 * `token_cursor`.
 *
 * Each iteration calls the tokenizer's PROC_NEXT, which pushes the token
 * text and a status word onto the ctx stack; token filters are applied
 * to the popped values. Tokens flagged GRN_TOKEN_SKIP[_WITH_POSITION],
 * empty tokens and tokens longer than GRN_TABLE_MAX_KEY_SIZE are
 * skipped. Depending on the mode, the token is added to the lexicon
 * (GRN_TOKENIZE_ADD), looked up (GRN_TOKENIZE_GET), or neither
 * (GRN_TOKENIZE_ONLY). Returns the token id, or GRN_ID_NIL when not
 * found / on lock failure / at the end of input.
 */
grn_id
grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
{
  int status;
  grn_id tid = GRN_ID_NIL;
  grn_obj *table = token_cursor->table;
  grn_obj *tokenizer = token_cursor->tokenizer;
  while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
    if (tokenizer) {
      grn_obj *curr_, *stat_;
      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table,
                                                &token_cursor->pctx.user_data);
      /* The tokenizer pushed text then status; pop in reverse order. */
      stat_ = grn_ctx_pop(ctx);
      curr_ = grn_ctx_pop(ctx);
      /* Token filters may rewrite the token and its status. */
      status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
                                                         curr_, stat_);
      /* In GET mode we stop at REACH_END; in ADD mode we continue until
         the tokenizer reports LAST. */
      token_cursor->status =
        ((status & GRN_TOKEN_LAST) ||
         (token_cursor->mode == GRN_TOKENIZE_GET &&
          (status & GRN_TOKEN_REACH_END)))
        ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING;
      token_cursor->force_prefix = GRN_FALSE;
#define SKIP_FLAGS \
  (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION)
      if (status & SKIP_FLAGS) {
        /* Plain SKIP consumes a position; SKIP_WITH_POSITION does not. */
        if (status & GRN_TOKEN_SKIP) {
          token_cursor->pos++;
        }
        /* If the last token was skipped and nothing was produced,
           report the distinct DONE_SKIP state instead of looping. */
        if (token_cursor->status == GRN_TOKEN_CURSOR_DONE &&
            tid == GRN_ID_NIL) {
          token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP;
          break;
        } else {
          continue;
        }
      }
#undef SKIP_FLAGS
      if (status & GRN_TOKEN_FORCE_PREFIX) {
        token_cursor->force_prefix = GRN_TRUE;
      }
      /* Skip (but log) empty tokens — they cannot be table keys. */
      if (token_cursor->curr_size == 0) {
        if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
          char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
          int tokenizer_name_length;
          tokenizer_name_length =
            grn_obj_name(ctx, token_cursor->tokenizer,
                         tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
          GRN_LOG(ctx, GRN_WARN,
                  "[token_next] ignore an empty token: <%.*s>: <%.*s>",
                  tokenizer_name_length, tokenizer_name,
                  token_cursor->orig_blen, token_cursor->orig);
        }
        continue;
      }
      /* Skip tokens that would exceed the key-size limit of the table. */
      if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
        GRN_LOG(ctx, GRN_WARN,
                "[token_next] ignore too long token. "
                "Token must be less than or equal to %d: <%d>(<%.*s>)",
                GRN_TABLE_MAX_KEY_SIZE,
                token_cursor->curr_size,
                token_cursor->curr_size, token_cursor->curr);
        continue;
      }
      if (status & GRN_TOKEN_UNMATURED) {
        if (status & GRN_TOKEN_OVERLAP) {
          /* Overlapping unmatured tokens are skipped in GET mode. */
          if (token_cursor->mode == GRN_TOKENIZE_GET) {
            token_cursor->pos++;
            continue;
          }
        } else {
          if (status & GRN_TOKEN_REACH_END) {
            token_cursor->force_prefix = GRN_TRUE;
          }
        }
      }
    } else {
      token_cursor->status = GRN_TOKEN_CURSOR_DONE;
    }
    if (token_cursor->mode == GRN_TOKENIZE_ADD) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr,
                            token_cursor->curr_size, NULL, NULL);
          grn_io_unlock(((grn_pat *)table)->io);
        }
        break;
      case GRN_TABLE_DAT_KEY :
        if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr,
                            token_cursor->curr_size, NULL, NULL);
          grn_io_unlock(((grn_dat *)table)->io);
        }
        break;
      case GRN_TABLE_HASH_KEY :
        if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr,
                             token_cursor->curr_size, NULL, NULL);
          grn_io_unlock(((grn_hash *)table)->io);
        }
        break;
      case GRN_TABLE_NO_KEY :
        /* NO_KEY tables: the token bytes are already a grn_id. */
        if (token_cursor->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token_cursor->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    } else if (token_cursor->mode != GRN_TOKENIZE_ONLY) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr,
                          token_cursor->curr_size, NULL);
        break;
      case GRN_TABLE_DAT_KEY :
        tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr,
                          token_cursor->curr_size, NULL);
        break;
      case GRN_TABLE_HASH_KEY :
        tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr,
                           token_cursor->curr_size, NULL);
        break;
      case GRN_TABLE_NO_KEY :
        if (token_cursor->curr_size == sizeof(grn_id)) {
          tid = *((grn_id *)token_cursor->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      }
    }
    if (token_cursor->mode != GRN_TOKENIZE_ONLY &&
        tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
      token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND;
    }
    token_cursor->pos++;
    break;
  }
  return tid;
}