/*
 * Scans str_ptr (str_length bytes, in the given encoding) for the next
 * tokenized-delimiter character.
 *
 * The bytes preceding the delimiter -- or the whole remaining input when
 * no delimiter occurs -- are stored into token via grn_token_set_data();
 * the delimiter itself is never part of the token text.  The token status
 * is GRN_TOKEN_LAST only when the scan consumed the entire input,
 * otherwise GRN_TOKEN_CONTINUE.
 *
 * Returns a pointer just past the delimiter (the position to resume from
 * on the next call), or NULL when no delimiter was found.  NOTE(review):
 * an invalid byte sequence (grn_charlen_() == 0) also returns NULL while
 * leaving the status GRN_TOKEN_CONTINUE -- callers are expected to treat
 * a NULL return as "stop"; confirm against call sites.
 */
const char *
grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
                                          grn_token *token,
                                          const char *str_ptr,
                                          unsigned int str_length,
                                          grn_encoding encoding)
{
  const char *p = str_ptr;
  const char *tail = str_ptr + str_length;
  const char *rest = NULL;

  while (p < tail) {
    size_t len = grn_charlen_(ctx, p, tail, encoding);
    if (len == 0) {
      /* Invalid byte sequence: stop scanning; rest stays NULL. */
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, p, len, encoding)) {
      /* Resume just after the delimiter on the next call. */
      rest = p + len;
      break;
    }
    p += len;
  }

  /* Token text is everything before the delimiter (delimiter excluded). */
  grn_token_set_data(ctx, token, str_ptr, p - str_ptr);
  grn_token_set_status(ctx, token,
                       (p == tail) ? GRN_TOKEN_LAST : GRN_TOKEN_CONTINUE);
  return rest;
}
/*
 * Token filter callback: replaces each token with its stemmed form using
 * the Snowball stemmer.  Only UTF-8 contexts are processed; tokens in any
 * other encoding pass through untouched.
 *
 * Fix: the original deleted and recreated the sb_stemmer on every token
 * even though the algorithm and encoding are constants -- pure churn plus
 * a per-token allocation-failure path.  The stemmer is now created lazily
 * once and reused.  As before, the live stemmer stored in
 * token_filter->stemmer is presumably released by the filter's fin
 * callback -- TODO confirm (not visible in this chunk).
 */
static void
stem_filter(grn_ctx *ctx,
            grn_token *current_token,
            grn_token *next_token,
            void *user_data)
{
  grn_stem_token_filter *token_filter = user_data;
  grn_obj *data;

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  data = grn_token_get_data(ctx, current_token);

  if (!token_filter->stemmer) {
    /* TODO: Detect algorithm from the current token.  Until then the
     * algorithm is fixed, so one stemmer instance serves every token. */
    const char *algorithm = "english";
    const char *encoding = "UTF_8";
    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
    if (!token_filter->stemmer) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[token-filter][stem] "
                       "failed to create stemmer: "
                       "algorithm=<%s>, encoding=<%s>",
                       algorithm, encoding);
      return;
    }
  }

  {
    const sb_symbol *stemmed;
    stemmed = sb_stemmer_stem(token_filter->stemmer,
                              GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
    if (stemmed) {
      grn_token_set_data(ctx, next_token,
                         stemmed,
                         sb_stemmer_length(token_filter->stemmer));
    } else {
      /* libstemmer returns NULL only on allocation failure. */
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[token-filter][stem] "
                       "failed to allocate memory for stemmed word: <%.*s>",
                       (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
      return;
    }
  }
}