static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_FREE(tokenizer);
  return NULL;
}
static grn_obj *
delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_query_close(ctx, tokenizer->query);
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  GRN_FREE(tokenizer);
  return NULL;
}
/* This function finalizes a tokenization. */
static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  if (tokenizer->buf) {
    GRN_PLUGIN_FREE(ctx, tokenizer->buf);
  }
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}
static grn_obj *
regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_OBJ_FIN(ctx, &(tokenizer->buffer));
  GRN_FREE(tokenizer);
  return NULL;
}
static grn_obj *
ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
           uint8_t ngram_unit, uint8_t uni_alpha, uint8_t uni_digit,
           uint8_t uni_symbol, uint8_t ignore_blank)
{
  unsigned int normalize_flags =
    GRN_STRING_REMOVE_BLANK |
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_ngram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][ngram] "
        "memory allocation to grn_ngram_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->uni_alpha = uni_alpha;
  tokenizer->uni_digit = uni_digit;
  tokenizer->uni_symbol = uni_symbol;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->overlap = 0;
  tokenizer->pos = 0;
  tokenizer->skip = 0;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            &(tokenizer->len));
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);
  return NULL;
}
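/* Concrete N-gram tokenizers are typically thin wrappers that call
   ngram_init() with fixed parameters. The wrapper below is a minimal sketch
   (the name bigram_init and the specific flag values are illustrative, not
   taken from the listing above): it requests 2-character grams and keeps
   alphabet, digit, and symbol runs as single tokens. */
static grn_obj *
bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* ngram_unit=2, uni_alpha=1, uni_digit=1, uni_symbol=1, ignore_blank=0 */
  return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0);
}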
static grn_obj *
sample_fin(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
           grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->get.n_skip_tokens = 0;

  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end = GRN_FALSE;
  tokenizer->is_start_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes, NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->nth_char = 0;
  tokenizer->char_types =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args,
               grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags = 0;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_delimited_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;

  tokenizer->query = query;

  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           tokenizer->query->ptr,
                                           tokenizer->query->length,
                                           tokenizer->query->encoding);
  tokenizer->delimiter = delimiter;
  tokenizer->delimiter_len = delimiter_len;
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes, NULL);
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
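/* A concrete delimiter-based tokenizer supplies the byte sequence to split
   on. A minimal sketch of such a wrapper (the name delimit_init is assumed
   for illustration): it passes a single ASCII space as the delimiter. */
static grn_obj *
delimit_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  static const uint8_t delimiter[1] = {' '};
  return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
}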
static grn_obj *
sample_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  sample_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(sample_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][sample] "
                     "memory allocation to sample_tokenizer failed");
    return NULL;
  }

  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->next = normalized_string;
  tokenizer->rest = normalized_string_length;

  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
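/* Init/fin callbacks such as sample_init and sample_fin are paired with a
   "next" callback and registered from the plugin's register hook. The sketch
   below shows the usual wiring via grn_tokenizer_register(); sample_next is
   not part of this listing and is assumed to exist elsewhere in the plugin. */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_tokenizer_register(ctx, "sample", strlen("sample"),
                                sample_init, sample_next, sample_fin);
}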
static grn_obj *
yangram_fin(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
            grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  if (tokenizer->vgram_table) {
    grn_obj_unlink(ctx, tokenizer->vgram_table);
  }
  if (tokenizer->phrase_table) {
    grn_obj_unlink(ctx, tokenizer->phrase_table);
    GRN_PLUGIN_FREE(ctx, tokenizer->hits);
  }
  grn_tokenizer_query_close(ctx, tokenizer->query);
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = 0;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->get.have_begin = GRN_FALSE;
  tokenizer->get.have_end = GRN_FALSE;
  tokenizer->get.n_skip_tokens = 0;

  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end = GRN_FALSE;
  tokenizer->is_first_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes, NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
    unsigned int query_length = tokenizer->query->length;
    if (query_length >= 2) {
      const char *query_string = tokenizer->query->ptr;
      grn_encoding encoding = tokenizer->query->encoding;
      if (query_string[0] == '\\' && query_string[1] == 'A') {
        tokenizer->get.have_begin = GRN_TRUE;
        /* TODO: This assumes that both "\\" and "A" are normalized to
           one character each. The normalizer may omit a character or
           expand it to multiple characters. */
        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                        encoding);
        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                        encoding);
      }
      if (query_string[query_length - 2] == '\\' &&
          query_string[query_length - 1] == 'z') {
        tokenizer->get.have_end = GRN_TRUE;
        /* TODO: This assumes that both "\\" and "z" are normalized to
           1-byte characters. The normalizer may omit a character or
           expand it to multiple characters. */
        tokenizer->end -= grn_charlen_(ctx, tokenizer->end - 1, tokenizer->end,
                                       encoding);
        tokenizer->end -= grn_charlen_(ctx, tokenizer->end - 1, tokenizer->end,
                                       encoding);
      }
    }
  }

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
void
grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_tokenizer_query_close(ctx, query);
}
static grn_obj *
yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
             unsigned short ngram_unit, grn_bool ignore_blank,
             grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit,
             grn_bool skip_overlap, unsigned short use_vgram)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags =
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER |
    GRN_STRING_REMOVE_BLANK;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_yangram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer)))) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][yangram] "
                     "memory allocation to grn_yangram_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->skip_overlap = skip_overlap;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->split_symbol = split_symbol;
  tokenizer->split_alpha = split_alpha;
  tokenizer->split_digit = split_digit;
  tokenizer->use_vgram = use_vgram;

  if (tokenizer->use_vgram > 0) {
    const char *vgram_word_table_name_env;
    vgram_word_table_name_env = getenv("GRN_VGRAM_WORD_TABLE_NAME");
    if (vgram_word_table_name_env) {
      tokenizer->vgram_table =
        grn_ctx_get(ctx,
                    vgram_word_table_name_env,
                    strlen(vgram_word_table_name_env));
    } else {
      tokenizer->vgram_table =
        grn_ctx_get(ctx,
                    VGRAM_WORD_TABLE_NAME,
                    strlen(VGRAM_WORD_TABLE_NAME));
    }
    if (!tokenizer->vgram_table) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[tokenizer][yangram] "
                       "couldn't open a vgram table");
      tokenizer->vgram_table = NULL;
      return NULL;
    }
  } else {
    tokenizer->vgram_table = NULL;
  }

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes, NULL);

  {
    const char *phrase_table_name_env;
    phrase_table_name_env = getenv("GRN_KNOWN_PHRASE_TABLE_NAME");
    if (phrase_table_name_env) {
      tokenizer->phrase_table =
        grn_ctx_get(ctx,
                    phrase_table_name_env,
                    strlen(phrase_table_name_env));
    } else {
      tokenizer->phrase_table =
        grn_ctx_get(ctx,
                    KNOWN_PHRASE_TABLE_NAME,
                    strlen(KNOWN_PHRASE_TABLE_NAME));
    }
    if (tokenizer->phrase_table) {
      if (!(tokenizer->hits =
              GRN_PLUGIN_MALLOC(ctx,
                                sizeof(grn_pat_scan_hit) * MAX_N_HITS))) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[tokenizer][yangram] "
                         "memory allocation to grn_pat_scan_hit failed");
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      } else {
        tokenizer->scan_rest = normalized;
        tokenizer->nhits = 0;
        tokenizer->current_hit = 0;
      }
    } else {
      tokenizer->phrase_table = NULL;
    }
  }

  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->rest_length = tokenizer->end - tokenizer->next;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  tokenizer->pushed_token_tail = NULL;
  tokenizer->ctypes_next = 0;

  return NULL;
}
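/* As with ngram_init() above, concrete yangram tokenizers would be thin
   wrappers that fix the parameters. A minimal sketch (the wrapper name and
   the chosen values are illustrative assumptions): a 2-gram unit, blanks
   respected, no splitting of symbol, alpha, or digit runs, overlap skipping
   enabled, and variable-gram support disabled. */
static grn_obj *
yabigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  return yangram_init(ctx, nargs, args, user_data,
                      2,          /* ngram_unit */
                      GRN_FALSE,  /* ignore_blank */
                      GRN_FALSE,  /* split_symbol */
                      GRN_FALSE,  /* split_alpha */
                      GRN_FALSE,  /* split_digit */
                      GRN_TRUE,   /* skip_overlap */
                      0);         /* use_vgram */
}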
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short and long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
*/
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) {
        *p = '\0';
      }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short and long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
*/
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) {
      *p = '\0';
    }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}