/*
 * Scans `nstrs` strings against the query `q`.
 *
 * ctx      : groonga context.
 * q        : query to evaluate; its snippet conditions are allocated here
 *            when they do not exist yet.
 * strs     : array of `nstrs` string pointers.
 * str_lens : array of `nstrs` byte lengths, parallel to `strs`.
 * nstrs    : number of strings; must be non-zero.
 * flags    : GRN_QUERY_SCAN_* flags.  GRN_QUERY_SCAN_ALLOCCONDS is managed
 *            internally and is rejected when the conditions already exist.
 * found    : output, reset to 0 and then updated by scan_query().
 * score    : output, reset to 0 and then updated by scan_query().
 *
 * Returns GRN_SUCCESS, GRN_INVALID_ARGUMENT for bad arguments,
 * GRN_NO_MEMORY_AVAILABLE when a string cannot be opened, or the error
 * reported by alloc_snip_conds() / scan_query().
 */
grn_rc
grn_query_scan(grn_ctx *ctx, grn_query *q, const char **strs, unsigned int *str_lens,
               unsigned int nstrs, int flags, int *found, int *score)
{
  unsigned int i;
  grn_rc rc;

  /* Every pointer dereferenced below has to be validated up front:
     str_lens is indexed per string and found/score are written to. */
  if (!q || !strs || !str_lens || !nstrs || !found || !score) {
    return GRN_INVALID_ARGUMENT;
  }
  *found = *score = 0;

  if (!q->snip_conds) {
    if ((rc = alloc_snip_conds(ctx, q))) { return rc; }
    /* Tell the first scan_query() call that the conditions are fresh. */
    flags |= GRN_QUERY_SCAN_ALLOCCONDS;
  } else if (flags & GRN_QUERY_SCAN_ALLOCCONDS) {
    GRN_LOG(ctx, GRN_LOG_WARNING,
            "invalid flags specified on grn_query_scan");
    return GRN_INVALID_ARGUMENT;
  }

  for (i = 0; i < nstrs; i++) {
    grn_str *n;
    snip_cond *sc = q->snip_conds;
    int f = GRN_STR_WITH_CHECKS | GRN_STR_REMOVEBLANK;
    if (flags & GRN_QUERY_SCAN_NORMALIZE) {
      f |= GRN_STR_NORMALIZE;
    }
    n = grn_str_open(ctx, *(strs + i), *(str_lens + i), f);
    if (!n) { return GRN_NO_MEMORY_AVAILABLE; }
    /* Strings are passed to scan_query() with a 1-based index. */
    if ((rc = scan_query(ctx, q, n, i + 1, q->expr, &sc, GRN_OP_OR,
                         flags, found, score))) {
      grn_str_close(ctx, n);
      return rc;
    }
    /* Only the first string is scanned with ALLOCCONDS set. */
    flags &= ~GRN_QUERY_SCAN_ALLOCCONDS;
    grn_str_close(ctx, n);
  }
  return GRN_SUCCESS;
}
/*
 * Data-driven test: converts the UTF-8 "input" to the encoding under test,
 * normalizes it with grn_str_open(), and compares the normalized bytes and
 * their length with the "expected" text converted to the same encoding.
 */
void
test_normalize(gconstpointer data)
{
  const int open_flags =
    GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES;
  grn_encoding target_encoding;
  const gchar *source_utf8;
  const gchar *source_bytes;
  const gchar *expected_utf8;
  const gchar *encoded_expected;
  grn_str *string;
  const gchar *normalized_text;
  guint normalized_text_len;

  target_encoding = gcut_data_get_int(data, "encoding");
  GRN_CTX_SET_ENCODING(&context, target_encoding);

  source_utf8 = gcut_data_get_string(data, "input");
  source_bytes = convert_encoding(source_utf8, target_encoding);
  string = grn_str_open(&context, source_bytes, strlen(source_bytes),
                        open_flags);

  /* Copy the normalized bytes out before the string object is closed. */
  normalized_text_len = string->norm_blen;
  normalized_text = cut_take_strndup(string->norm, normalized_text_len);
  grn_test_assert(grn_str_close(&context, string));

  expected_utf8 = gcut_data_get_string(data, "expected");
  encoded_expected = convert_encoding(expected_utf8, target_encoding);

  cut_assert_equal_string(encoded_expected, normalized_text);
  cut_assert_equal_int(strlen(encoded_expected), normalized_text_len);
}
/*
 * Finalizer of the n-gram tokenizer: closes the tokenizer's string and
 * frees the tokenizer state stored in `user_data->ptr`.
 *
 * `nargs` and `args` are unused.  Always returns NULL.
 */
static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *token = user_data->ptr;

  /* Nothing was stored (e.g. initialization did not complete), so there
     is nothing to release and token->nstr must not be dereferenced. */
  if (!token) {
    return NULL;
  }

  grn_str_close(ctx, token->nstr);
  GRN_FREE(token);
  return NULL;
}
/*
 * Finalizer of the n-gram tokenizer: closes the tokenizer's string and
 * frees the tokenizer state stored in `user_data->ptr`.
 *
 * `table` is unused.  Always returns GRN_SUCCESS.
 */
static grn_rc
ngram_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_ngram_tokenizer *token = user_data->ptr;

  /* Nothing was stored (e.g. initialization did not complete), so there
     is nothing to release and token->nstr must not be dereferenced. */
  if (!token) {
    return GRN_SUCCESS;
  }

  grn_str_close(ctx, token->nstr);
  GRN_FREE(token);
  return GRN_SUCCESS;
}
/*
 * Finalizer of the MeCab tokenizer: closes the tokenizer's string, frees
 * the buffer holding MeCab's output and frees the tokenizer state stored
 * in `user_data->ptr`.
 *
 * `table` is unused.  Always returns GRN_SUCCESS.
 */
static grn_rc
mecab_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_mecab_tokenizer *token = user_data->ptr;

  /* Nothing was stored (e.g. initialization did not complete), so there
     is nothing to release and the members must not be dereferenced. */
  if (!token) {
    return GRN_SUCCESS;
  }

  /* token->mecab is deliberately not destroyed here (the mecab_destroy()
     call was disabled in the original code).
     NOTE(review): presumably the handle is shared between tokenizer
     instances -- confirm before re-enabling. */
  grn_str_close(ctx, token->nstr);
  GRN_FREE(token->buf);
  GRN_FREE(token);
  return GRN_SUCCESS;
}
/*
 * Data-driven test: feeds input whose encoding ("input-encoding") or byte
 * length ("input-length") is supplied separately from the context encoding
 * ("context-encoding") and expects the normalized result to be empty.
 * A negative "input-length" means "use the full encoded length".
 */
void
test_normalize_broken(gconstpointer data)
{
  const int open_flags =
    GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES;
  grn_encoding ctx_encoding;
  grn_encoding source_encoding;
  const gchar *source_text;
  const gchar *source_bytes;
  gint requested_length;
  gint byte_length;
  grn_str *string;

  ctx_encoding = gcut_data_get_int(data, "context-encoding");
  GRN_CTX_SET_ENCODING(&context, ctx_encoding);

  source_text = gcut_data_get_string(data, "input");
  source_encoding = gcut_data_get_int(data, "input-encoding");
  requested_length = gcut_data_get_int(data, "input-length");
  source_bytes = convert_encoding(source_text, source_encoding);

  byte_length = (requested_length < 0)
    ? (gint)strlen(source_bytes)
    : requested_length;

  string = grn_str_open(&context, source_bytes, byte_length, open_flags);
  cut_assert_equal_string("", string->norm);
  cut_assert_equal_int(0, string->norm_blen);
  grn_test_assert(grn_str_close(&context, string));
}
int grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str, unsigned int str_size, grn_dat_scan_hit *scan_hits, unsigned int max_num_scan_hits, const char **str_rest) { if (!grn_dat_open_trie_if_needed(ctx, dat) || !str || !(dat->obj.header.flags & GRN_OBJ_KEY_VAR_SIZE) || !scan_hits) { return -1; } grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie); if (!trie) { return -1; } if (!max_num_scan_hits || !str_size) { if (str_rest) { *str_rest = str; } return 0; } int num_scan_hits = 0; try { if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) { grn_str * const normalized_str = grn_str_open( ctx, str, str_size, GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS); if (!normalized_str) { fprintf(stderr, "error: grn_str_open() failed!\n"); return -1; } str = normalized_str->norm; str_size = normalized_str->norm_blen; const short *checks = normalized_str->checks; unsigned int offset = 0; while (str_size) { if (*checks) { grn::dat::UInt32 key_pos; if (trie->lcp_search(str, str_size, &key_pos)) { const grn::dat::Key &key = trie->get_key(key_pos); const grn::dat::UInt32 key_length = key.length(); if ((key_length == str_size) || (checks[key_length])) { unsigned int length = 0; for (grn::dat::UInt32 i = 0; i < key_length; ++i) { if (checks[i] > 0) { length += checks[i]; } } scan_hits[num_scan_hits].id = key.id(); scan_hits[num_scan_hits].offset = offset; scan_hits[num_scan_hits].length = length; offset += length; str += key_length; str_size -= key_length; checks += key_length; if (++num_scan_hits >= max_num_scan_hits) { break; } continue; } } if (*checks > 0) { offset += *checks; } } ++str; --str_size; ++checks; } if (str_rest) { *str_rest = normalized_str->orig + offset; } grn_str_close(ctx, normalized_str); } else { const char * const begin = str; while (str_size) { grn::dat::UInt32 key_pos; if (trie->lcp_search(str, str_size, &key_pos)) { const grn::dat::Key &key = trie->get_key(key_pos); scan_hits[num_scan_hits].id = key.id(); scan_hits[num_scan_hits].offset = str 
- begin; scan_hits[num_scan_hits].length = key.length(); str += key.length(); str_size -= key.length(); if (++num_scan_hits >= max_num_scan_hits) { break; } } else { const int char_length = grn_charlen(ctx, str, str + str_size); if (char_length) { str += char_length; str_size -= char_length; } else { ++str; --str_size; } } } if (str_rest) { *str_rest = str; } } } catch (const grn::dat::Exception &ex) { ERR(grn_dat_translate_error_code(ex.code()), "grn::dat::lcp_search failed"); return -1; } return num_scan_hits; }
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).

  The string to tokenize is popped from `ctx'; `args[0]' is the table whose
  flags and encoding are used. `nargs' is unused. On success the tokenizer
  state is stored in `user_data->ptr'; on failure `user_data->ptr' is left
  untouched. NULL is returned in both cases.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf = NULL, *end;
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize = 0, len;

  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  /* Create the single MeCab instance on first use; the pointer is checked
     again inside the critical section before it is created. */
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR, "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }

  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }

  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) {
    return NULL;
  }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  /* Normalize the input only when the table normalizes its keys. */
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  len = token->nstr->norm_blen;

  /* MeCab's result is copied into our own buffer before the lock is
     released. */
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%u err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }

  /* A certain version of mecab returns trailing lf or spaces.
     `end' starts at the terminating NUL and walks backwards, so no pointer
     before `buf' is ever formed, even when the output is empty. */
  end = buf + bufsize - 1;
  while (buf < end && isspace((unsigned char)end[-1])) {
    *--end = '\0';
  }

  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = end;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}