/*
 * Allocate an empty score heap able to hold up to `max` nodes.
 *
 * Returns the new heap, or NULL when either allocation fails; on the
 * second failure the partially constructed heap is released first.
 * The caller owns the returned heap.
 */
static inline score_heap *
score_heap_open(grn_ctx *ctx, int max)
{
  score_heap *heap;

  heap = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap));
  if (!heap) {
    return NULL;
  }
  heap->nodes = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap_node) * max);
  if (!heap->nodes) {
    GRN_PLUGIN_FREE(ctx, heap);
    return NULL;
  }
  heap->n_entries = 0;
  heap->limit = max;
  return heap;
}
/*
 * Replace the raw query string stored in `query` with a NUL-terminated
 * copy of `string` (`string_length` bytes).  An empty string clears the
 * buffer and marks the query as needing re-normalization.
 *
 * Returns ctx->rc (GRN_SUCCESS on success; GRN_TOKENIZER_ERROR when the
 * copy cannot be allocated).
 *
 * Bug fix: the original freed the old buffer but left query->ptr and
 * query->length pointing into it, so an allocation failure below left
 * the query holding a dangling pointer and a stale length.  The derived
 * fields are now reset immediately after the free.
 */
grn_rc
grn_tokenizer_query_set_raw_string(grn_ctx *ctx,
                                   grn_tokenizer_query *query,
                                   const char *string,
                                   size_t string_length)
{
  GRN_API_ENTER;
  if (query->query_buf) {
    GRN_PLUGIN_FREE(ctx, query->query_buf);
  }
  /* The old buffer is gone: clear all fields derived from it so the
     query never dangles, even if the allocation below fails. */
  query->query_buf = NULL;
  query->ptr = NULL;
  query->length = 0;
  if (string_length == 0) {
    query->need_normalize = GRN_TRUE;
  } else {
    query->query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, string_length + 1);
    if (!query->query_buf) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][query] failed to duplicate query");
      GRN_API_RETURN(ctx->rc);
    }
    grn_memcpy(query->query_buf, string, string_length);
    query->query_buf[string_length] = '\0';
    query->ptr = query->query_buf;
    query->length = string_length;
  }
  GRN_API_RETURN(ctx->rc);
}
/*
 * Tokenizer-API entry point: builds a grn_tokenizer_query from the three
 * values the tokenizer protocol pushes on the ctx stack (flags, query
 * string, tokenize mode) plus the lexicon passed as args[0].
 *
 * Returns NULL on failure (ctx->rc is set either here or by the failing
 * callee).  On success the caller owns the returned query and must
 * release it with grn_tokenizer_query_close().
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags;
  grn_obj *query_str;
  grn_obj *tokenize_mode;
  GRN_API_ENTER;
  /* All three stack values are popped before any validation. */
  flags = grn_ctx_pop(ctx);
  query_str = grn_ctx_pop(ctx);
  tokenize_mode = grn_ctx_pop(ctx);
  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    GRN_API_RETURN(NULL);
  }
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    GRN_API_RETURN(NULL);
  }
  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (!query) {
      GRN_API_RETURN(NULL);
    }
    grn_tokenizer_query_init(ctx, query);
    /* Copy the raw query bytes into the query object; failure is
       reported through ctx->rc rather than a return value. */
    grn_tokenizer_query_set_raw_string(ctx, query,
                                       GRN_TEXT_VALUE(query_str),
                                       GRN_TEXT_LEN(query_str));
    if (ctx->rc != GRN_SUCCESS) {
      GRN_PLUGIN_FREE(ctx, query);
      GRN_API_RETURN(NULL);
    }
    /* flags and tokenize_mode are optional stack values. */
    if (flags) {
      grn_tokenizer_query_set_flags(ctx, query, GRN_UINT32_VALUE(flags));
    }
    if (tokenize_mode) {
      grn_tokenizer_query_set_mode(ctx, query, GRN_UINT32_VALUE(tokenize_mode));
    }
    grn_tokenizer_query_set_normalize_flags(ctx, query, normalize_flags);
    grn_tokenizer_query_set_lexicon(ctx, query, args[0]);
    grn_tokenizer_query_ensure_have_tokenized_delimiter(ctx, query);
    GRN_API_RETURN(query);
  }
}
/*
 * Emit the "indexes" array for `column` into the command output:
 * one map per index entry, with the keys full_name / table / name /
 * section.  A NULL column (or a column with no indexes) produces an
 * empty array.
 */
static void
command_schema_column_output_indexes(grn_ctx *ctx, grn_obj *column)
{
  grn_index_datum *data = NULL;
  uint32_t n = 0;
  uint32_t i;

  if (column) {
    /* First call probes the count; second call fills the buffer. */
    n = grn_column_get_all_index_data(ctx, column, NULL, 0);
    if (n > 0) {
      data = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_index_datum) * n);
      if (!data) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[schema] failed to allocate memory for indexes");
        return;
      }
      grn_column_get_all_index_data(ctx, column, data, n);
    }
  }

  grn_ctx_output_array_open(ctx, "indexes", n);
  for (i = 0; i < n; i++) {
    grn_obj *lexicon;

    grn_ctx_output_map_open(ctx, "index", 4);

    grn_ctx_output_cstr(ctx, "full_name");
    command_schema_output_name(ctx, data[i].index);

    grn_ctx_output_cstr(ctx, "table");
    lexicon = grn_ctx_at(ctx, data[i].index->header.domain);
    command_schema_output_name(ctx, lexicon);

    grn_ctx_output_cstr(ctx, "name");
    command_schema_output_column_name(ctx, data[i].index);

    grn_ctx_output_cstr(ctx, "section");
    grn_ctx_output_uint64(ctx, data[i].section);

    grn_ctx_output_map_close(ctx);
  }
  grn_ctx_output_array_close(ctx);

  if (data) {
    GRN_PLUGIN_FREE(ctx, data);
  }
}
/*
 * Token-filter init callback: allocate the stem filter's per-session
 * state.  Returns the new state, or NULL (with ctx->rc set) when the
 * allocation fails.
 */
static void *
stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
{
  grn_stem_token_filter * const filter =
    GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));

  if (!filter) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stem] "
                     "failed to allocate grn_stem_token_filter");
    return NULL;
  }
  filter->stemmer = NULL;
  grn_tokenizer_token_init(ctx, &(filter->token));
  return filter;
}
/*
 * Tokenizer init callback: open the tokenizer query, allocate the
 * per-session state and point it at the normalized query text.
 * The return value of an init callback is ignored by the caller;
 * failures are reported via ctx->rc.
 */
static grn_obj *
sample_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  sample_tokenizer *tokenizer;
  const char *norm_text;
  unsigned int norm_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(sample_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][sample] "
                     "memory allocation to sample_tokenizer failed");
    return NULL;
  }

  tokenizer->query = query;
  grn_string_get_normalized(ctx,
                            query->normalized_query,
                            &norm_text,
                            &norm_length,
                            NULL);
  tokenizer->next = norm_text;
  tokenizer->rest = norm_length;

  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  return NULL;
}
/*
 * Compute the edit (Levenshtein) distance between the byte ranges
 * [sx, ex) and [sy, ey), stepping by character using grn_charlen().
 * When `flags` contains GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION,
 * an adjacent transposition also counts as a single edit
 * (Damerau variant).
 *
 * NOTE(review): returns 0 when the DP table allocation fails, which is
 * indistinguishable from an exact match for the caller.
 */
static uint32_t
calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey,
                   int flags)
{
  int d = 0;
  uint32_t cx, lx, cy, ly, *dists;
  char *px, *py;
  /* First pass: count the characters in each string. */
  for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
  for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
  /* DIST(x, y) indexes the (lx + 1) x (ly + 1) DP table in `dists`. */
  if ((dists = GRN_PLUGIN_MALLOC(ctx, (lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
    uint32_t x, y;
    /* Base rows: cost of deleting/inserting a prefix. */
    for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
    for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
    for (x = 1, px = sx; x <= lx; x++, px += cx) {
      cx = grn_charlen(ctx, px, ex);
      for (y = 1, py = sy; y <= ly; y++, py += cy) {
        cy = grn_charlen(ctx, py, ey);
        if (cx == cy && !memcmp(px, py, cx)) {
          /* Characters match: carry the diagonal cost forward. */
          DIST(x, y) = DIST(x - 1, y - 1);
        } else {
          /* Minimum of deletion (a), insertion (b), substitution (c). */
          uint32_t a = DIST(x - 1, y) + 1;
          uint32_t b = DIST(x, y - 1) + 1;
          uint32_t c = DIST(x - 1, y - 1) + 1;
          DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
          if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION &&
              x > 1 && y > 1 &&
              cx == cy &&
              memcmp(px, py - cy, cx) == 0 &&
              memcmp(px - cx, py, cx) == 0) {
            /* The two adjacent characters are swapped: allow a single
               transposition edit instead. */
            uint32_t t = DIST(x - 2, y - 2) + 1;
            DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
          }
        }
      }
    }
    d = DIST(lx, ly);
    GRN_PLUGIN_FREE(ctx, dists);
  }
  return d;
}
/*
 * Legacy variant of grn_tokenizer_query_open(): pops flags, the query
 * string and the tokenize mode from the ctx stack, duplicates the raw
 * query bytes, and opens a normalized copy of the query using the
 * lexicon table's (args[0]) normalizer and encoding.
 *
 * Returns NULL on failure with ctx->rc set by GRN_PLUGIN_ERROR; on
 * success the caller owns the returned query.
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* The three stack values are popped before any validation. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }
  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    /* flags and tokenize_mode are optional; fall back to defaults. */
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    /* token_mode mirrors tokenize_mode. */
    query->token_mode = query->tokenize_mode;
    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;
      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      /* The lexicon table supplies the encoding and (optionally) the
         normalizer used for the normalized copy. */
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          /* Unwind both allocations made so far. */
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        /* Also keep a NUL-terminated copy of the raw
           (pre-normalization) query bytes. */
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;
      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        /* Detect tokenized-delimiter markers in the normalized text. */
        const char *normalized_string;
        unsigned int normalized_string_length;
        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
static grn_obj * yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, unsigned short ngram_unit, grn_bool ignore_blank, grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit, grn_bool skip_overlap, unsigned short use_vgram) { grn_tokenizer_query *query; unsigned int normalize_flags = GRN_STRING_WITH_TYPES | GRN_STRING_REMOVE_TOKENIZED_DELIMITER | GRN_STRING_REMOVE_BLANK; const char *normalized; unsigned int normalized_length_in_bytes; grn_yangram_tokenizer *tokenizer; query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags); if (!query) { return NULL; } if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer)))) { GRN_PLUGIN_ERROR(ctx,GRN_NO_MEMORY_AVAILABLE, "[tokenizer][yangram] " "memory allocation to grn_yangram_tokenizer failed"); grn_tokenizer_query_close(ctx, query); return NULL; } user_data->ptr = tokenizer; grn_tokenizer_token_init(ctx, &(tokenizer->token)); tokenizer->query = query; tokenizer->skip_overlap = skip_overlap; tokenizer->ignore_blank = ignore_blank; tokenizer->ngram_unit = ngram_unit; tokenizer->split_symbol = split_symbol; tokenizer->split_alpha = split_alpha; tokenizer->split_digit = split_digit; tokenizer->use_vgram = use_vgram; if (tokenizer->use_vgram > 0) { const char *vgram_word_table_name_env; vgram_word_table_name_env = getenv("GRN_VGRAM_WORD_TABLE_NAME"); if (vgram_word_table_name_env) { tokenizer->vgram_table = grn_ctx_get(ctx, vgram_word_table_name_env, strlen(vgram_word_table_name_env)); } else { tokenizer->vgram_table = grn_ctx_get(ctx, VGRAM_WORD_TABLE_NAME, strlen(VGRAM_WORD_TABLE_NAME)); } if (!tokenizer->vgram_table) { GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, "[tokenizer][yangram] " "couldn't open a vgram table"); tokenizer->vgram_table = NULL; return NULL; } } else { tokenizer->vgram_table = NULL; } grn_string_get_normalized(ctx, tokenizer->query->normalized_query, &normalized, &normalized_length_in_bytes, NULL); { const char *phrase_table_name_env; 
phrase_table_name_env = getenv("GRN_KNOWN_PHRASE_TABLE_NAME"); if (phrase_table_name_env) { tokenizer->phrase_table = grn_ctx_get(ctx, phrase_table_name_env, strlen(phrase_table_name_env)); } else { tokenizer->phrase_table = grn_ctx_get(ctx, KNOWN_PHRASE_TABLE_NAME, strlen(KNOWN_PHRASE_TABLE_NAME)); } if (tokenizer->phrase_table) { if (!(tokenizer->hits = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_pat_scan_hit) * MAX_N_HITS))) { GRN_PLUGIN_ERROR(ctx,GRN_NO_MEMORY_AVAILABLE, "[tokenizer][yangram] " "memory allocation to grn_pat_scan_hit failed"); grn_tokenizer_query_close(ctx, query); return NULL; } else { tokenizer->scan_rest = normalized; tokenizer->nhits = 0; tokenizer->current_hit = 0; } } else { tokenizer->phrase_table = NULL; } } tokenizer->next = (const unsigned char *)normalized; tokenizer->end = tokenizer->next + normalized_length_in_bytes; tokenizer->rest_length = tokenizer->end - tokenizer->next; tokenizer->ctypes = grn_string_get_types(ctx, tokenizer->query->normalized_query); tokenizer->pushed_token_tail = NULL; tokenizer->ctypes_next = 0; return NULL; }
/* This function is called for a full text search query or a document to be
   indexed. This means that both short/long strings are given. The return
   value of this function is ignored. When an error occurs in this function,
   `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;
  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  /* Create the shared MeCab handle on first use, rechecking under
     sole_mecab_mutex. */
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  /* The MeCab dictionary charset must match the table encoding. */
  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;
  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    /* Pre-tokenized input: iterate over the normalized text directly,
       without invoking MeCab. */
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    /* MeCab calls are serialized through sole_mecab_mutex. */
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;
      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      /* NOTE(review): if bufsize < 2 the starting expression
         `buf + bufsize - 2` computes a pointer before the buffer; the
         loop body never runs in that case, but the computation itself
         is out of bounds — confirm whether bufsize is always >= 2
         here. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) {
        *p = '\0';
      }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  return NULL;
}
/* This function is called for a full text search query or a document to be
   indexed. This means that both short/long strings are given. The return
   value of this function is ignored. When an error occurs in this function,
   `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;
  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  /* Create the shared MeCab handle on first use, rechecking under
     sole_mecab_mutex. */
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  /* The MeCab dictionary charset must match the table encoding. */
  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;
  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);
  if (tokenizer->have_tokenized_delimiter) {
    /* Pre-tokenized input: no MeCab pass, no owned buffer. */
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      /* Copy MeCab's result out before releasing the mutex: the string
         returned by mecab_sparse_tostr2() belongs to the handle. */
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    /* `buf` is only evaluated when `s` is non-NULL (short-circuit), and
       it is always assigned on that path, so this read is safe. */
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) {
      *p = '\0';
    }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  return NULL;
}