Exemplo n.º 1
0
const char *
grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
                                          grn_token *token,
                                          const char *str_ptr,
                                          unsigned int str_length,
                                          grn_encoding encoding)
{
  size_t char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      next_start = str_ptr + (current - start + char_length);
      break;
    }
  }

  grn_token_set_data(ctx, token, start, current - start);
  if (current == end) {
    grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
  } else {
    grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
  }

  return next_start;
}
Exemplo n.º 2
0
static void
stem_filter(grn_ctx *ctx,
            grn_token *current_token,
            grn_token *next_token,
            void *user_data)
{
  grn_stem_token_filter *token_filter = user_data;
  grn_obj *data;

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  data = grn_token_get_data(ctx, current_token);

  if (token_filter->stemmer) {
    sb_stemmer_delete(token_filter->stemmer);
  }
  {
    /* TODO: Detect algorithm from the current token. */
    const char *algorithm = "english";
    const char *encoding = "UTF_8";
    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
    if (!token_filter->stemmer) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[token-filter][stem] "
                       "failed to create stemmer: "
                       "algorithm=<%s>, encoding=<%s>",
                       algorithm, encoding);
      return;
    }
  }

  {
    const sb_symbol *stemmed;

    stemmed = sb_stemmer_stem(token_filter->stemmer,
                              GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
    if (stemmed) {
      grn_token_set_data(ctx, next_token,
                         stemmed,
                         sb_stemmer_length(token_filter->stemmer));
    } else {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[token-filter][stem] "
                       "failed to allocate memory for stemmed word: <%.*s>",
                       (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
      return;
    }
  }
}