Exemplo n.º 1
0
/*
 * Applies the text operator `op` to two raw byte strings.
 *
 * `target` is always run through the normalizer looked up by
 * GRN_NORMALIZER_AUTO_NAME. `query` is normalized the same way, except for
 * GRN_OP_REGEXP, where the query bytes are handed to the operator untouched.
 *
 * Returns GRN_FALSE when either input is empty, when a normalized string
 * cannot be opened, or when exec_text_operator() reports no match.
 */
static grn_bool
exec_text_operator_raw_text_raw_text(grn_ctx *ctx,
                                     grn_operator op,
                                     const char *target,
                                     unsigned int target_len,
                                     const char *query,
                                     unsigned int query_len)
{
  grn_obj *normalizer;
  grn_obj *norm_target = NULL;
  grn_obj *norm_query = NULL;
  const char *norm_target_raw;
  const char *norm_query_raw;
  unsigned int norm_target_raw_length_in_bytes;
  unsigned int norm_query_raw_length_in_bytes;
  grn_bool matched = GRN_FALSE;

  if (target_len == 0 || query_len == 0) {
    return GRN_FALSE;
  }

  normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);

  norm_target = grn_string_open(ctx, target, target_len, normalizer, 0);
  if (!norm_target) {
    /* Nothing to compare against; do not touch the unset raw pointers. */
    goto exit;
  }
  grn_string_get_normalized(ctx, norm_target,
                            &norm_target_raw,
                            &norm_target_raw_length_in_bytes,
                            NULL);

  if (op == GRN_OP_REGEXP) {
    /* The pattern is used as written; only the target is normalized. */
    norm_query_raw = query;
    norm_query_raw_length_in_bytes = query_len;
  } else {
    norm_query = grn_string_open(ctx, query, query_len, normalizer, 0);
    if (!norm_query) {
      goto exit;
    }
    grn_string_get_normalized(ctx, norm_query,
                              &norm_query_raw,
                              &norm_query_raw_length_in_bytes,
                              NULL);
  }

  matched = exec_text_operator(ctx, op,
                               norm_target_raw,
                               norm_target_raw_length_in_bytes,
                               norm_query_raw,
                               norm_query_raw_length_in_bytes);

exit:
  /* Single cleanup path: release only what was actually opened. */
  if (norm_target) {
    grn_obj_close(ctx, norm_target);
  }
  if (norm_query) {
    grn_obj_close(ctx, norm_query);
  }
  grn_obj_unlink(ctx, normalizer);

  return matched;
}
Exemplo n.º 2
0
/*
 * Data-driven test: opens the "input" string with
 * GRN_STRING_REMOVE_TOKENIZED_DELIMITER plus the per-case "flags", and
 * compares the normalized text with the "expected" string.
 */
void
test_remove_tokenized_delimiter(gconstpointer data)
{
  const gchar *source_text;
  const gchar *expected_text;
  const gchar *actual_text;
  unsigned int actual_size;
  grn_obj *normalizer;
  grn_obj *grn_text;
  int open_flags;

  GRN_CTX_SET_ENCODING(&context, GRN_ENC_UTF8);

  source_text = gcut_data_get_string(data, "input");
  open_flags = GRN_STRING_REMOVE_TOKENIZED_DELIMITER |
               gcut_data_get_int(data, "flags");

  /* A normalizer is only used when the test case asks for normalization. */
  normalizer = (open_flags & GRN_OBJ_KEY_NORMALIZE) ? GRN_NORMALIZER_AUTO : NULL;

  grn_text = grn_string_open(&context,
                             source_text, strlen(source_text),
                             normalizer, open_flags);
  grn_string_get_normalized(&context, grn_text,
                            &actual_text, &actual_size, NULL);
  /* Copy the result before the string object that holds it is released. */
  actual_text = cut_take_strndup(actual_text, actual_size);
  grn_obj_unlink(&context, grn_text);

  expected_text = gcut_data_get_string(data, "expected");
  cut_assert_equal_string(expected_text, actual_text);
}
Exemplo n.º 3
0
/*
 * Normalizes the given @string@ and returns the result.
 *
 * @example
 *   # Normalizes "AbC" with the default normalizer
 *   Groonga::Normalizer.normalize("AbC") # => "abc"
 *
 * @overload normalize(string)
 *   @param [String] string The original string
 *   @return [String] The normalized string
 */
static VALUE
rb_grn_normalizer_s_normalize (VALUE klass, VALUE rb_string)
{
    VALUE rb_context = Qnil;
    VALUE rb_input;
    VALUE rb_result;
    grn_ctx *context;
    grn_obj *normalized;
    const char *result_bytes;
    unsigned int result_size;

    context = rb_grn_context_ensure(&rb_context);
    rb_input = rb_grn_context_rb_string_encode(context, rb_string);

    /* TODO: make the normalizer and the flags customizable */
    normalized = grn_string_open(context,
                                 RSTRING_PTR(rb_input),
                                 RSTRING_LEN(rb_input),
                                 GRN_NORMALIZER_AUTO,
                                 GRN_STRING_REMOVE_BLANK);
    /* Let the binding inspect the context state left by grn_string_open(). */
    rb_grn_context_check(context, rb_string);

    grn_string_get_normalized(context, normalized,
                              &result_bytes, &result_size,
                              NULL);
    /* Build the Ruby string first: result_bytes belongs to `normalized`. */
    rb_result = rb_grn_context_rb_string_new(context,
                                             result_bytes,
                                             result_size);
    grn_obj_close(context, normalized);

    return rb_result;
}
Exemplo n.º 4
0
/*
 * Initializes the snippet condition `sc` for `keyword`.
 *
 * The condition is zero-filled, the keyword is opened as a normalized string
 * (blanks removed) and, for keywords whose normalized form is longer than one
 * byte, the bad-character table and the shift value are prepared.
 *
 * `enc` and `flags` are accepted but not read by this function.
 *
 * Returns GRN_NO_MEMORY_AVAILABLE when the keyword string cannot be opened,
 * GRN_INVALID_ARGUMENT when the normalized keyword is empty, and
 * GRN_SUCCESS otherwise.
 */
grn_rc
grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len,
                   grn_encoding enc, grn_obj *normalizer, int flags)
{
  const char *normalized;
  unsigned int normalized_size;
  unsigned char last_byte;

  memset(sc, 0, sizeof(snip_cond));

  sc->keyword = grn_string_open(ctx, keyword, keyword_len,
                                normalizer, GRN_STR_REMOVEBLANK);
  if (!sc->keyword) {
    GRN_LOG(ctx, GRN_LOG_ALERT,
            "grn_string_open on snip_cond_init failed!");
    return GRN_NO_MEMORY_AVAILABLE;
  }

  grn_string_get_normalized(ctx, sc->keyword,
                            &normalized, &normalized_size, NULL);
  if (normalized_size == 0) {
    grn_snip_cond_close(ctx, sc);
    return GRN_INVALID_ARGUMENT;
  }
  if (normalized_size == 1) {
    /* Single-byte keywords need no table. */
    return GRN_SUCCESS;
  }

  grn_bm_preBmBc((unsigned char *)normalized, normalized_size, sc->bmBc);
  /* Remember the shift for the last byte, then zero its table entry. */
  last_byte = (unsigned char)normalized[normalized_size - 1];
  sc->shift = sc->bmBc[last_byte];
  sc->bmBc[last_byte] = 0;

  return GRN_SUCCESS;
}
Exemplo n.º 5
0
/*
 * Makes sure query->have_tokenized_delimiter is up to date.
 *
 * The query is normalized first. The delimiter scan then runs at most once
 * per pending check: need_delimiter_check is cleared before scanning. When
 * tokenized delimiters are not enabled in query->flags the result is simply
 * GRN_FALSE.
 */
static void
grn_tokenizer_query_ensure_have_tokenized_delimiter(grn_ctx *ctx,
                                                    grn_tokenizer_query *query)
{
  const char *text;
  unsigned int text_size;

  grn_tokenizer_query_ensure_normalized(ctx, query);

  if (query->need_delimiter_check) {
    query->need_delimiter_check = GRN_FALSE;

    if (!(query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER)) {
      query->have_tokenized_delimiter = GRN_FALSE;
      return;
    }

    grn_string_get_normalized(ctx,
                              query->normalized_query,
                              &text,
                              &text_size,
                              NULL);
    query->have_tokenized_delimiter =
      grn_tokenizer_have_tokenized_delimiter(ctx,
                                             text,
                                             text_size,
                                             query->encoding);
  }
}
Exemplo n.º 6
0
/*
 * Data-driven test for broken input: whatever the test case supplies, the
 * normalized text is expected to be empty (zero bytes, zero characters).
 *
 * A negative "input-length" means "use the full length of the converted
 * input".
 */
void
test_normalize_broken(gconstpointer data)
{
  const int open_flags = GRN_STRING_WITH_CHECKS | GRN_STRING_WITH_TYPES;
  grn_encoding ctx_encoding;
  grn_encoding source_encoding;
  const gchar *source;
  const gchar *converted;
  gint n_bytes;
  grn_obj *grn_text;
  const gchar *result;
  guint result_size;
  guint result_n_chars;

  ctx_encoding = gcut_data_get_int(data, "context-encoding");
  GRN_CTX_SET_ENCODING(&context, ctx_encoding);

  source = gcut_data_get_string(data, "input");
  source_encoding = gcut_data_get_int(data, "input-encoding");
  n_bytes = gcut_data_get_int(data, "input-length");
  converted = convert_encoding(source, source_encoding);
  if (n_bytes < 0) {
    n_bytes = strlen(converted);
  }

  grn_text = grn_string_open(&context, converted, n_bytes,
                             GRN_NORMALIZER_AUTO, open_flags);
  grn_string_get_normalized(&context, grn_text,
                            &result, &result_size, &result_n_chars);
  /* Copy the result before the string object that holds it is released. */
  result = cut_take_strndup(result, result_size);
  grn_obj_unlink(&context, grn_text);

  cut_assert_equal_string("", result);
  cut_assert_equal_int(0, result_size);
  cut_assert_equal_int(0, result_n_chars);
}
Exemplo n.º 7
0
/*
 * Appends a keyword condition to `snip`.
 *
 * The next free slot in snip->cond is initialized for `keyword`, rejected if
 * the normalized keyword is longer than snip->width, and then given its open
 * and close tags (falling back to the snippet defaults inside
 * grn_snip_cond_set_tag()). snip->cond_len is only incremented when every
 * step succeeded.
 *
 * Returns GRN_INVALID_ARGUMENT for a NULL snip/keyword, an empty keyword, a
 * full condition array or a too-long keyword; otherwise the first failing
 * step's return code, or GRN_SUCCESS.
 */
grn_rc
grn_snip_add_cond(grn_ctx *ctx, grn_snip *snip,
                  const char *keyword, unsigned int keyword_len,
                  const char *opentag, unsigned int opentag_len,
                  const char *closetag, unsigned int closetag_len)
{
  snip_cond *slot;
  unsigned int keyword_norm_size;
  int should_copy_tags;
  grn_rc rc;

  if (!snip || !keyword || !keyword_len) {
    return GRN_INVALID_ARGUMENT;
  }
  if (snip->cond_len >= MAX_SNIP_COND_COUNT) {
    return GRN_INVALID_ARGUMENT;
  }

  slot = snip->cond + snip->cond_len;
  rc = grn_snip_cond_init(ctx, slot, keyword, keyword_len,
                          snip->encoding, snip->normalizer, snip->flags);
  if (rc) {
    return rc;
  }

  grn_string_get_normalized(ctx, slot->keyword,
                            NULL, &keyword_norm_size, NULL);
  if (keyword_norm_size > snip->width) {
    grn_snip_cond_close(ctx, slot);
    return GRN_INVALID_ARGUMENT;
  }

  should_copy_tags = snip->flags & GRN_SNIP_COPY_TAG;

  rc = grn_snip_cond_set_tag(ctx,
                             &(slot->opentag), &(slot->opentag_len),
                             opentag, opentag_len,
                             snip->defaultopentag, snip->defaultopentag_len,
                             should_copy_tags);
  if (rc) {
    grn_snip_cond_close(ctx, slot);
    return rc;
  }

  rc = grn_snip_cond_set_tag(ctx,
                             &(slot->closetag), &(slot->closetag_len),
                             closetag, closetag_len,
                             snip->defaultclosetag, snip->defaultclosetag_len,
                             should_copy_tags);
  if (rc) {
    /* The open tag was duplicated above only for a caller-supplied tag. */
    if (opentag && should_copy_tags) {
      GRN_FREE((void *)slot->opentag);
    }
    grn_snip_cond_close(ctx, slot);
    return rc;
  }

  snip->cond_len++;
  return GRN_SUCCESS;
}
Exemplo n.º 8
0
/*
 * UDF body: normalizes the first argument with the normalizer and flags
 * stored in the per-call state (initid->ptr) and returns the normalized
 * bytes.
 *
 * - A NULL first argument yields a NULL result (*is_null = 1) without
 *   flagging an error.
 * - On success the returned pointer refers to info->result_str and *length
 *   is set to its size.
 * - On failure (buffer reservation failed, or the Groonga context reports an
 *   error) *is_null and *error are set to 1 and NULL is returned.
 */
MRN_API char *mroonga_normalize(UDF_INIT *initid, UDF_ARGS *args, char *result,
                                unsigned long *length, char *is_null, char *error)
{
  st_mrn_normalize_info *info = (st_mrn_normalize_info *)initid->ptr;
  grn_ctx *ctx = info->ctx;
  String *result_str = &(info->result_str);

  if (!args->args[0]) {
    *is_null = 1;
    return NULL;
  }

  result_str->length(0);
  {
    char *target = args->args[0];
    unsigned int target_length = args->lengths[0];
    grn_obj *grn_string;
    const char *normalized;
    unsigned int normalized_length_in_bytes;
    unsigned int normalized_n_characters;

    grn_string = grn_string_open(ctx,
                                 target, target_length,
                                 info->normalizer, info->flags);
    grn_string_get_normalized(ctx, grn_string,
                              &normalized,
                              &normalized_length_in_bytes,
                              &normalized_n_characters);
    if (result_str->reserve(normalized_length_in_bytes)) {
      /* Release the normalized string before bailing out; the error path
         below does not know about it. */
      grn_obj_unlink(ctx, grn_string);
      my_error(ER_OUT_OF_RESOURCES, MYF(0), HA_ERR_OUT_OF_MEM);
      goto error;
    }
    /* Copy out of grn_string before it is released. */
    result_str->q_append(normalized, normalized_length_in_bytes);
    result_str->length(normalized_length_in_bytes);
    grn_obj_unlink(ctx, grn_string);
  }
  *is_null = 0;

  if (ctx->rc) {
    my_message(ER_ERROR_ON_WRITE, ctx->errbuf, MYF(0));
    goto error;
  }

  *length = result_str->length();
  return (char *)result_str->ptr();

error:
  *is_null = 1;
  *error = 1;
  return NULL;
}
Exemplo n.º 9
0
/*
 * Shared initializer for the n-gram tokenizers.
 *
 * Opens the tokenizer query (blanks and tokenized delimiters removed,
 * character types recorded), allocates the tokenizer state, stores it in
 * user_data->ptr and points the scan cursor at the normalized text.
 *
 * Always returns NULL. If the query cannot be opened or the state cannot be
 * allocated, user_data->ptr is not set by this function.
 */
static grn_obj *
ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit,
           uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
  const unsigned int normalize_flags =
    GRN_STRING_REMOVE_BLANK |
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
  grn_tokenizer_query *query;
  grn_ngram_tokenizer *state;
  const char *text;
  unsigned int text_size;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  state = GRN_MALLOC(sizeof(grn_ngram_tokenizer));
  if (!state) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][ngram] "
        "memory allocation to grn_ngram_tokenizer failed");
    return NULL;
  }
  user_data->ptr = state;

  grn_tokenizer_token_init(ctx, &(state->token));
  state->query = query;

  /* Tokenizer configuration supplied by the caller. */
  state->ngram_unit = ngram_unit;
  state->uni_alpha = uni_alpha;
  state->uni_digit = uni_digit;
  state->uni_symbol = uni_symbol;
  state->ignore_blank = ignore_blank;

  /* Scan state starts from scratch. */
  state->overlap = 0;
  state->pos = 0;
  state->skip = 0;

  grn_string_get_normalized(ctx, query->normalized_query,
                            &text, &text_size,
                            &(state->len));
  state->next = (const unsigned char *)text;
  state->end = state->next + text_size;
  state->ctypes = grn_string_get_types(ctx, query->normalized_query);

  return NULL;
}
Exemplo n.º 10
0
/*
 * Initializer for the regexp tokenizer.
 *
 * Opens the tokenizer query with character types recorded, allocates the
 * tokenizer state, stores it in user_data->ptr and positions the cursor at
 * the start of the normalized text.
 *
 * Always returns NULL. If the query cannot be opened or the state cannot be
 * allocated, user_data->ptr is not set by this function.
 */
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  const unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
  grn_tokenizer_query *query;
  grn_regexp_tokenizer *state;
  const char *text;
  unsigned int text_size;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  state = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!state) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = state;

  grn_tokenizer_token_init(ctx, &(state->token));
  state->query = query;

  /* Initial values of the scan flags. */
  state->get.n_skip_tokens = 0;
  state->is_begin = GRN_TRUE;
  state->is_end = GRN_FALSE;
  state->is_start_token = GRN_TRUE;
  state->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, query->normalized_query,
                            &text, &text_size,
                            NULL);
  state->next = text;
  state->end = state->next + text_size;
  state->nth_char = 0;
  state->char_types = grn_string_get_types(ctx, query->normalized_query);

  GRN_TEXT_INIT(&(state->buffer), 0);

  return NULL;
}
Exemplo n.º 11
0
/*
 * Shared initializer for the delimiter-based tokenizers.
 *
 * Opens the tokenizer query without extra normalize flags, allocates the
 * tokenizer state, stores it in user_data->ptr, records the delimiter to
 * split on and points the cursor at the normalized text. Whether the raw
 * query text contains tokenized delimiters is detected up front.
 *
 * Always returns NULL. If the query cannot be opened or the state cannot be
 * allocated, user_data->ptr is not set by this function.
 */
static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  const unsigned int normalize_flags = 0;
  grn_tokenizer_query *query;
  grn_delimited_tokenizer *state;
  const char *text;
  unsigned int text_size;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  state = GRN_MALLOC(sizeof(grn_delimited_tokenizer));
  if (!state) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = state;

  state->query = query;
  state->delimiter = delimiter;
  state->delimiter_len = delimiter_len;

  /* The delimiter check looks at the query text as given (query->ptr). */
  state->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           query->ptr,
                                           query->length,
                                           query->encoding);

  grn_string_get_normalized(ctx, query->normalized_query,
                            &text, &text_size,
                            NULL);
  state->next = (const unsigned char *)text;
  state->end = state->next + text_size;

  grn_tokenizer_token_init(ctx, &(state->token));

  return NULL;
}
Exemplo n.º 12
0
/*
 * Initializer for the sample tokenizer plugin.
 *
 * Opens the tokenizer query, allocates the tokenizer state through the
 * plugin allocator and records the normalized text and how many bytes of it
 * remain to be tokenized. The state is handed back via user_data->ptr.
 *
 * Always returns NULL. If the query cannot be opened or the state cannot be
 * allocated, user_data->ptr is not set by this function.
 */
static grn_obj *
sample_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  const unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  sample_tokenizer *state;
  const char *text;
  unsigned int text_size;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  state = GRN_PLUGIN_MALLOC(ctx, sizeof(sample_tokenizer));
  if (!state) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][sample] "
                     "memory allocation to sample_tokenizer failed");
    return NULL;
  }

  state->query = query;

  grn_string_get_normalized(ctx,
                            query->normalized_query,
                            &text,
                            &text_size,
                            NULL);
  state->next = text;
  state->rest = text_size;

  user_data->ptr = state;

  grn_tokenizer_token_init(ctx, &(state->token));

  return NULL;
}
Exemplo n.º 13
0
/*
 * Data-driven test: converts the UTF-8 "input" to the case's "encoding",
 * normalizes it with GRN_NORMALIZER_AUTO and checks the normalized text, its
 * byte length and its character count against the "expected" string.
 */
void
test_normalize(gconstpointer data)
{
  grn_encoding encoding;
  int open_flags;
  const gchar *input_utf8;
  const gchar *input_encoded;
  const gchar *expected_utf8;
  const gchar *expected_encoded;
  grn_obj *grn_text;
  const gchar *result;
  guint result_size;
  guint result_n_chars;

  encoding = gcut_data_get_int(data, "encoding");
  GRN_CTX_SET_ENCODING(&context, encoding);

  open_flags = GRN_STRING_WITH_CHECKS | GRN_STRING_WITH_TYPES;
  input_utf8 = gcut_data_get_string(data, "input");
  input_encoded = convert_encoding(input_utf8, encoding);

  grn_text = grn_string_open(&context,
                             input_encoded, strlen(input_encoded),
                             GRN_NORMALIZER_AUTO, open_flags);
  grn_string_get_normalized(&context, grn_text,
                            &result, &result_size, &result_n_chars);
  /* Copy the result before the string object that holds it is released. */
  result = cut_take_strndup(result, result_size);
  grn_obj_unlink(&context, grn_text);

  expected_utf8 = gcut_data_get_string(data, "expected");
  expected_encoded = convert_encoding(expected_utf8, encoding);

  cut_assert_equal_string(expected_encoded, result);
  /* Byte length is compared in the target encoding ... */
  cut_assert_equal_uint(strlen(expected_encoded), result_size);
  /* ... while the character count is taken from the UTF-8 form. */
  cut_assert_equal_uint(g_utf8_strlen(expected_utf8, -1),
                        result_n_chars);
}
Exemplo n.º 14
0
/*
 * Initializer for the regexp tokenizer.
 *
 * Opens the tokenizer query, allocates the tokenizer state, stores it in
 * user_data->ptr and positions the [next, end) cursor over the normalized
 * text. In GRN_TOKEN_GET mode a leading "\A" and a trailing "\z" in the
 * original query are recorded in get.have_begin / get.have_end and cut off
 * the cursor range.
 *
 * Always returns NULL. If the query cannot be opened or the state cannot be
 * allocated, user_data->ptr is not set by this function.
 */
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = 0;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    /* The query is released before the error is reported. */
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  /* State that only matters in GRN_TOKEN_GET mode (see below). */
  tokenizer->get.have_begin = GRN_FALSE;
  tokenizer->get.have_end   = GRN_FALSE;
  tokenizer->get.n_skip_tokens = 0;

  /* Initial values of the scan flags. */
  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end   = GRN_FALSE;
  tokenizer->is_first_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  /* The cursor covers the whole normalized text to begin with. */
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
    /* The anchors are detected on the ORIGINAL query bytes, but the cursor
       being trimmed points into the NORMALIZED text. */
    unsigned int query_length = tokenizer->query->length;
    if (query_length >= 2) {
      const char *query_string = tokenizer->query->ptr;
      grn_encoding encoding = tokenizer->query->encoding;
      if (query_string[0] == '\\' && query_string[1] == 'A') {
        tokenizer->get.have_begin = GRN_TRUE;
        /* TODO: It assumes that both "\\" and "A" are each normalized to
           exactly one character. Normalizer may omit character or expand
           to multiple characters. */
        /* Skip two characters from the front: one call per character. */
        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                        encoding);
        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                        encoding);
      }
      if (query_string[query_length - 2] == '\\' &&
          query_string[query_length - 1] == 'z') {
        tokenizer->get.have_end = GRN_TRUE;
        /* TODO: It assumes that both "\\" and "z" are each normalized to
           exactly one 1-byte character. Normalizer may omit character or
           expand to multiple characters. */
        /* Drop two characters from the back. Each call measures the
           character starting at end - 1, hence the 1-byte assumption. */
        tokenizer->end -= grn_charlen_(ctx,
                                       tokenizer->end - 1,
                                       tokenizer->end,
                                       encoding);
        tokenizer->end -= grn_charlen_(ctx,
                                       tokenizer->end - 1,
                                       tokenizer->end,
                                       encoding);
      }
    }
  }

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
Exemplo n.º 15
0
/*
 * Builds a grn_tokenizer_query from the values pushed on the context.
 *
 * Three objects are popped from `ctx` in this order: flags, the query
 * string and the tokenize mode. args[0] is the table whose encoding and
 * normalizer settings decide how the query string is normalized.
 *
 * The returned query holds
 *   - a NUL-terminated private copy of the query string (query_buf / ptr /
 *     length),
 *   - the normalized string opened with `normalize_flags`,
 *   - the table encoding, and
 *   - whether the normalized text contains tokenized delimiters (only
 *     checked when GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER is set).
 *
 * Returns NULL on a missing query string, invalid arguments or allocation /
 * normalization failure; everything allocated so far is freed in that case.
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
  grn_tokenizer_query *query;
  grn_obj *table;
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_obj *normalizer = NULL;
  unsigned int query_length;
  char *query_buf;

  if (!query_str) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (num_args < 1 || !args || !args[0]) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  query = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
  if (!query) {
    return NULL;
  }
  query->normalized_query = NULL;
  query->query_buf = NULL;
  query->flags = flags ? GRN_UINT32_VALUE(flags) : 0;
  query->tokenize_mode =
    tokenize_mode ? GRN_UINT32_VALUE(tokenize_mode) : GRN_TOKENIZE_ADD;
  query->token_mode = query->tokenize_mode;

  /* Room for a private, NUL-terminated copy of the query text. */
  table = args[0];
  query_length = GRN_TEXT_LEN(query_str);
  query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
  if (!query_buf) {
    GRN_PLUGIN_FREE(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer] failed to duplicate query");
    return NULL;
  }

  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                     &normalizer, NULL);
  /* The key-normalize table flag overrides the table's own normalizer. */
  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
    normalizer = GRN_NORMALIZER_AUTO;
  }

  query->normalized_query = grn_string_open_(ctx,
                                             GRN_TEXT_VALUE(query_str),
                                             GRN_TEXT_LEN(query_str),
                                             normalizer,
                                             normalize_flags,
                                             table_encoding);
  if (!query->normalized_query) {
    GRN_PLUGIN_FREE(ctx, query_buf);
    GRN_PLUGIN_FREE(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer] failed to open normalized string");
    return NULL;
  }

  grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
  query_buf[query_length] = '\0';
  query->query_buf = query_buf;
  query->ptr = query_buf;
  query->length = query_length;
  query->encoding = table_encoding;

  if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
    const char *normalized_text;
    unsigned int normalized_text_size;

    grn_string_get_normalized(ctx,
                              query->normalized_query,
                              &normalized_text,
                              &normalized_text_size,
                              NULL);
    query->have_tokenized_delimiter =
      grn_tokenizer_have_tokenized_delimiter(ctx,
                                             normalized_text,
                                             normalized_text_size,
                                             query->encoding);
  } else {
    query->have_tokenized_delimiter = GRN_FALSE;
  }

  return query;
}
Exemplo n.º 16
0
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).

  On success the tokenizer state is stored in `user_data->ptr'. Its
  [next, end) range covers either the normalized query itself (when it
  already contains tokenized delimiters) or a private copy of MeCab's
  "-Owakati" output for it.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  /* Create the shared MeCab instance on first use. The pointer is tested
     once without the mutex and again with it held, so only one caller
     creates the instance. */
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  /* Creation failed (the error was reported above): give up. */
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  /* The encodings must agree, otherwise the query is rejected. Note that
     the message reads query->encoding after the query has been closed. */
  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    /* Input is already split by tokenized delimiters: MeCab is skipped and
       the normalized text is walked directly (no private buffer). */
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    /* The MeCab call and the copy of its result both happen while the
       mutex is held. */
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      /* bufsize includes the terminating NUL. */
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    /* `buf' is only assigned when `s' is non-NULL; the short-circuit below
       keeps it from being read otherwise. */
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    /* Walk back from the last character before the NUL, overwriting
       whitespace with NUL; `p' ends on the last kept byte (or buf - 1). */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) { *p = '\0'; }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
Exemplo n.º 17
0
void
grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags)
{
  register unsigned char *limit, ck;
  register const unsigned char *p, *cp;
  register size_t *bmBc, delta1, i;

  const unsigned char *x;
  unsigned char *y;
  size_t shift, found;

  const char *string_original;
  unsigned int string_original_length_in_bytes;
  const short *string_checks;
  grn_encoding string_encoding;
  const char *string_norm, *keyword_norm;
  unsigned int n, m;

  grn_string_get_original(ctx, string,
                          &string_original, &string_original_length_in_bytes);
  string_checks = grn_string_get_checks(ctx, string);
  string_encoding = grn_string_get_encoding(ctx, string);
  grn_string_get_normalized(ctx, string, &string_norm, &n, NULL);
  grn_string_get_normalized(ctx, cond->keyword, &keyword_norm, &m, NULL);

  y = (unsigned char *)string_norm;
  if (m == 1) {
    if (n > cond->found) {
      shift = 1;
      p = memchr(y + cond->found, keyword_norm[0], n - cond->found);
      if (p != NULL) {
        found = p - y;
        GRN_BM_COMPARE;
      }
    }
    cond->stopflag = SNIPCOND_STOP;
    return;
  }

  x = (unsigned char *)keyword_norm;
  bmBc = cond->bmBc;
  shift = cond->shift;

  /* Restart */
  p = y + m + cond->found;
  cp = x + m;
  ck = cp[-2];

  /* 12 means 1(initial offset) + 10 (in loop) + 1 (shift) */
  if (n - cond->found > 12 * m) {
    limit = y + n - 11 * m;
    while (p <= limit) {
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      continue;
    check:
      GRN_BM_BM_COMPARE;
      p += shift;
    }
  }
  /* limit check + search */
  limit = y + n;
  while(p <= limit) {
    if (!(delta1 = bmBc[p[-1]])) {
      GRN_BM_BM_COMPARE;
      p += shift;
    }
    p += delta1;
  }
  cond->stopflag = SNIPCOND_STOP;
}
Exemplo n.º 18
0
/*
 * Reports whether any element of `vector` matches the regular expression
 * held in `pattern`.
 *
 * Each non-empty element is normalized with the normalizer looked up by
 * GRN_NORMALIZER_AUTO_NAME before matching; empty elements are skipped.
 * The scan stops at the first match.
 *
 * Returns GRN_FALSE for an empty vector, when the pattern cannot be
 * compiled, when nothing matches, or when regexp support is compiled out.
 */
static grn_bool
exec_regexp_vector_bulk(grn_ctx *ctx, grn_obj *vector, grn_obj *pattern)
{
#ifdef GRN_SUPPORT_REGEXP
  grn_bool matched = GRN_FALSE;
  grn_obj *normalizer = NULL;
  OnigRegex regex;
  unsigned int n_elements;
  unsigned int nth;

  n_elements = grn_vector_size(ctx, vector);
  if (n_elements == 0) {
    return GRN_FALSE;
  }

  regex = grn_onigmo_new(ctx,
                         GRN_TEXT_VALUE(pattern),
                         GRN_TEXT_LEN(pattern),
                         GRN_ONIGMO_OPTION_DEFAULT,
                         GRN_ONIGMO_SYNTAX_DEFAULT,
                         "[operator]");
  if (!regex) {
    return GRN_FALSE;
  }

  normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
  for (nth = 0; !matched && nth < n_elements; nth++) {
    const char *element;
    unsigned int element_size;
    grn_id domain_id;
    grn_obj *normalized_element;
    const char *normalized_raw;
    unsigned int normalized_raw_size;

    element_size = grn_vector_get_element(ctx, vector, nth,
                                          &element, NULL, &domain_id);
    if (element_size > 0) {
      normalized_element = grn_string_open(ctx, element, element_size,
                                           normalizer, 0);
      grn_string_get_normalized(ctx, normalized_element,
                                &normalized_raw,
                                &normalized_raw_size,
                                NULL);

      matched = regexp_is_match(ctx, regex,
                                normalized_raw,
                                normalized_raw_size);

      grn_obj_unlink(ctx, normalized_element);
    }
  }
  grn_obj_unlink(ctx, normalizer);

  onig_free(regex);

  return matched;
#else /* GRN_SUPPORT_REGEXP */
  return GRN_FALSE;
#endif /* GRN_SUPPORT_REGEXP */
}
Exemplo n.º 19
0
/*
 * Tests whether the key of any record listed in `uvector` matches the
 * regular expression whose source text is held in `pattern`.
 *
 * Record IDs are resolved against the object that grn_ctx_at() returns for
 * uvector->header.domain. When grn_table_get_info() reports a normalizer
 * for that object, keys are matched exactly as grn_table_get_key() returns
 * them; otherwise each key is first normalized with the normalizer looked
 * up by GRN_NORMALIZER_AUTO_NAME. Records whose key size is 0 are skipped,
 * and scanning stops at the first match.
 *
 * Returns GRN_TRUE when a key matched. Returns GRN_FALSE when the uvector
 * is empty, no regex or no domain object could be obtained, nothing
 * matched, or the build has no GRN_SUPPORT_REGEXP.
 */
static grn_bool
exec_regexp_uvector_bulk(grn_ctx *ctx, grn_obj *uvector, grn_obj *pattern)
{
#ifdef GRN_SUPPORT_REGEXP
  grn_bool matched = GRN_FALSE;
  unsigned int i, size;
  OnigRegex regex;
  grn_obj *domain;
  /*
   * Must start as NULL: the result of grn_table_get_info() is not checked
   * below, so this function cannot rely on the out-parameter having been
   * written. An unset pointer would make the normalizer tests undefined.
   */
  grn_obj *normalizer = NULL;
  grn_obj *normalizer_auto = NULL;

  size = grn_uvector_size(ctx, uvector);
  if (size == 0) {
    return GRN_FALSE;
  }

  regex = grn_onigmo_new(ctx,
                         GRN_TEXT_VALUE(pattern),
                         GRN_TEXT_LEN(pattern),
                         GRN_ONIGMO_OPTION_DEFAULT,
                         GRN_ONIGMO_SYNTAX_DEFAULT,
                         "[operator]");
  if (!regex) {
    return GRN_FALSE;
  }

  domain = grn_ctx_at(ctx, uvector->header.domain);
  if (!domain) {
    onig_free(regex);
    return GRN_FALSE;
  }

  grn_table_get_info(ctx, domain, NULL, NULL, NULL, &normalizer, NULL);
  if (!normalizer) {
    /* No table normalizer: fall back to the auto normalizer per key. */
    normalizer_auto = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
  }

  for (i = 0; i < size; i++) {
    grn_id record_id;
    char key[GRN_TABLE_MAX_KEY_SIZE];
    int key_size;

    record_id = grn_uvector_get_element(ctx, uvector, i, NULL);
    key_size = grn_table_get_key(ctx, domain, record_id,
                                 key, GRN_TABLE_MAX_KEY_SIZE);
    if (key_size == 0) {
      continue;
    }

    if (normalizer) {
      /* presumably the stored key is already normalized -- TODO confirm */
      matched = regexp_is_match(ctx, regex, key, key_size);
    } else {
      grn_obj *norm_key;
      const char *norm_key_raw;
      unsigned int norm_key_raw_length_in_bytes;

      norm_key = grn_string_open(ctx, key, key_size, normalizer_auto, 0);
      grn_string_get_normalized(ctx, norm_key,
                                &norm_key_raw,
                                &norm_key_raw_length_in_bytes,
                                NULL);
      matched = regexp_is_match(ctx, regex,
                                norm_key_raw,
                                norm_key_raw_length_in_bytes);
      grn_obj_unlink(ctx, norm_key);
    }

    if (matched) {
      break;
    }
  }

  if (normalizer_auto) {
    grn_obj_unlink(ctx, normalizer_auto);
  }

  grn_obj_unlink(ctx, domain);

  onig_free(regex);

  return matched;
#else /* GRN_SUPPORT_REGEXP */
  return GRN_FALSE;
#endif /* GRN_SUPPORT_REGEXP */
}
Exemplo n.º 20
0
/*
 * Applies the text operator `op` to the key of `record` (a record of
 * `table`) as the target and the text in `query` as the query.
 *
 * Only tables whose header.domain is GRN_DB_SHORT_TEXT are handled, and an
 * empty query never matches; both cases return GRN_FALSE.
 *
 * When grn_table_get_info() reports a normalizer for `table`, the record
 * key is passed to exec_text_operator() as returned by grn_table_get_key(),
 * and the query is normalized through grn_string_open() with `table` as
 * its normalizer argument -- except for GRN_OP_REGEXP, where the query
 * text is used unmodified. When no normalizer is reported, both sides are
 * delegated to exec_text_operator_raw_text_raw_text().
 *
 * Returns the operator's result, or GRN_FALSE for the early-exit cases.
 */
static grn_bool
exec_text_operator_record_text(grn_ctx *ctx,
                               grn_operator op,
                               grn_obj *record, grn_obj *table,
                               grn_obj *query)
{
  /*
   * Must start as NULL: the result of grn_table_get_info() is not checked
   * below, so this function cannot rely on the out-parameter having been
   * written. An unset pointer would make `if (normalizer)` undefined.
   */
  grn_obj *normalizer = NULL;
  char record_key[GRN_TABLE_MAX_KEY_SIZE];
  int record_key_len;
  grn_bool matched = GRN_FALSE;

  if (table->header.domain != GRN_DB_SHORT_TEXT) {
    return GRN_FALSE;
  }

  if (GRN_TEXT_LEN(query) == 0) {
    return GRN_FALSE;
  }

  record_key_len = grn_table_get_key(ctx, table, GRN_RECORD_VALUE(record),
                                     record_key, GRN_TABLE_MAX_KEY_SIZE);
  grn_table_get_info(ctx, table, NULL, NULL, NULL, &normalizer, NULL);
  if (normalizer) {
    grn_obj *norm_query;
    const char *norm_query_raw;
    unsigned int norm_query_raw_length_in_bytes;

    if (op == GRN_OP_REGEXP) {
      /* A regular expression is used as written, without normalization. */
      norm_query = NULL;
      norm_query_raw = GRN_TEXT_VALUE(query);
      norm_query_raw_length_in_bytes = GRN_TEXT_LEN(query);
    } else {
      /*
       * `table`, not `normalizer`, is what gets passed to grn_string_open()
       * here; `normalizer` only selects this branch.
       */
      norm_query = grn_string_open(ctx,
                                   GRN_TEXT_VALUE(query),
                                   GRN_TEXT_LEN(query),
                                   table,
                                   0);
      grn_string_get_normalized(ctx, norm_query,
                                &norm_query_raw,
                                &norm_query_raw_length_in_bytes,
                                NULL);
    }
    matched = exec_text_operator(ctx,
                                 op,
                                 record_key,
                                 record_key_len,
                                 norm_query_raw,
                                 norm_query_raw_length_in_bytes);
    if (norm_query) {
      grn_obj_close(ctx, norm_query);
    }
  } else {
    matched = exec_text_operator_raw_text_raw_text(ctx,
                                                   op,
                                                   record_key,
                                                   record_key_len,
                                                   GRN_TEXT_VALUE(query),
                                                   GRN_TEXT_LEN(query));
  }

  return matched;
}
Exemplo n.º 21
0
    /*
     * Scans `str` (`str_size` bytes) for keys stored in the trie of `dat`
     * and records up to `max_num_scan_hits` hits in `scan_hits`.
     *
     * At each scan position trie->lcp_search() is tried; on a hit the key's
     * id, an offset and a length are stored and the scan jumps past the key.
     * When dat->obj.header.flags has GRN_OBJ_KEY_NORMALIZE, the input is
     * normalized first and the scan runs over the normalized text; hit
     * offsets and lengths are then accumulated from the positive entries of
     * the string's `checks` array (presumably mapping normalized bytes back
     * to byte counts in the original string -- TODO confirm).
     *
     * If `str_rest` is non-NULL it receives the position where scanning
     * stopped (in the normalized case: the original string start advanced
     * by the accumulated offset).
     *
     * Returns the number of hits written, 0 when `max_num_scan_hits` or
     * `str_size` is 0, and -1 when the trie cannot be used, `str` or
     * `scan_hits` is NULL, the table has no GRN_OBJ_KEY_VAR_SIZE flag,
     * grn_string_open() fails, or a grn::dat::Exception is caught.
     */
    int
    grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
                 unsigned int str_size, grn_dat_scan_hit *scan_hits,
                 unsigned int max_num_scan_hits, const char **str_rest) {
        if (!grn_dat_open_trie_if_needed(ctx, dat) || !str ||
                !(dat->obj.header.flags & GRN_OBJ_KEY_VAR_SIZE) || !scan_hits) {
            return -1;
        }

        grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie);
        if (!trie) {
            return -1;
        }

        if (!max_num_scan_hits || !str_size) {
            if (str_rest) {
                *str_rest = str;
            }
            return 0;
        }

        int num_scan_hits = 0;
        /*
         * Declared outside the try block so the exception handler can release
         * it: an exception thrown while scanning would otherwise skip the
         * grn_obj_close() at the end of the normalized branch and leak it.
         */
        grn_obj *normalized_string = NULL;
        try {
            if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
                grn_obj *normalizer = GRN_NORMALIZER_AUTO;
                int flags = GRN_STRING_WITH_CHECKS;
                normalized_string = grn_string_open(ctx, str, str_size,
                                                    normalizer,
                                                    flags);
                if (!normalized_string) {
                    fprintf(stderr, "error: grn_string_open() failed!\n");
                    return -1;
                }
                /* From here on `str`/`str_size` describe the normalized text. */
                grn_string_get_normalized(ctx, normalized_string, &str, &str_size, NULL);
                const short *checks = grn_string_get_checks(ctx, normalized_string);
                unsigned int offset = 0;
                while (str_size) {
                    /* Only positions with a non-zero check are tried as key starts. */
                    if (*checks) {
                        grn::dat::UInt32 key_pos;
                        if (trie->lcp_search(str, str_size, &key_pos)) {
                            const grn::dat::Key &key = trie->get_key(key_pos);
                            const grn::dat::UInt32 key_length = key.length();
                            /*
                             * Accept the key only if it ends at the end of the
                             * text or right before another non-zero check.
                             */
                            if ((key_length == str_size) || (checks[key_length])) {
                                unsigned int length = 0;
                                for (grn::dat::UInt32 i = 0; i < key_length; ++i) {
                                    if (checks[i] > 0) {
                                        length += checks[i];
                                    }
                                }
                                scan_hits[num_scan_hits].id = key.id();
                                scan_hits[num_scan_hits].offset = offset;
                                scan_hits[num_scan_hits].length = length;
                                offset += length;
                                str += key_length;
                                str_size -= key_length;
                                checks += key_length;
                                if (++num_scan_hits >= max_num_scan_hits) {
                                    break;
                                }
                                continue;
                            }
                        }
                        if (*checks > 0) {
                            offset += *checks;
                        }
                    }
                    ++str;
                    --str_size;
                    ++checks;
                }
                if (str_rest) {
                    grn_string_get_original(ctx, normalized_string, str_rest, NULL);
                    *str_rest += offset;
                }
                grn_obj_close(ctx, normalized_string);
                normalized_string = NULL;
            } else {
                const char * const begin = str;
                while (str_size) {
                    grn::dat::UInt32 key_pos;
                    if (trie->lcp_search(str, str_size, &key_pos)) {
                        const grn::dat::Key &key = trie->get_key(key_pos);
                        scan_hits[num_scan_hits].id = key.id();
                        scan_hits[num_scan_hits].offset = str - begin;
                        scan_hits[num_scan_hits].length = key.length();
                        str += key.length();
                        str_size -= key.length();
                        if (++num_scan_hits >= max_num_scan_hits) {
                            break;
                        }
                    } else {
                        /* No key here: step over one character, or one byte
                           when grn_charlen() reports 0. */
                        const int char_length = grn_charlen(ctx, str, str + str_size);
                        if (char_length) {
                            str += char_length;
                            str_size -= char_length;
                        } else {
                            ++str;
                            --str_size;
                        }
                    }
                }
                if (str_rest) {
                    *str_rest = str;
                }
            }
        } catch (const grn::dat::Exception &ex) {
            /* Release the normalized string if the scan was interrupted. */
            if (normalized_string) {
                grn_obj_close(ctx, normalized_string);
            }
            ERR(grn_dat_translate_error_code(ex.code()),
                "grn::dat::lcp_search failed");
            return -1;
        }
        return num_scan_hits;
    }
Exemplo n.º 22
0
/*
 * Allocates a grn_token for tokenizing `str` (`str_len` bytes) against
 * `table` and prepares it for iteration.
 *
 * The table's encoding, tokenizer and normalizer are obtained through
 * grn_table_get_info(). With a tokenizer, `str` and `flags` are pushed onto
 * the ctx (in that order) and the tokenizer's PROC_INIT function is
 * invoked. Without one, the string is opened through grn_string_open_()
 * with the table's normalizer, and the normalized text becomes the
 * token's current position.
 *
 * Returns the new token, or NULL when grn_table_get_info() reports an
 * error, allocation fails, or ctx->rc is set once setup has finished (the
 * token is closed again in that last case).
 */
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode, unsigned int flags)
{
  grn_token *token;
  grn_encoding encoding;
  grn_obj *tokenizer;
  grn_obj *normalizer;
  grn_obj_flags table_flags;

  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
                         &normalizer)) {
    return NULL;
  }

  token = GRN_MALLOC(sizeof(grn_token));
  if (!token) {
    return NULL;
  }

  /* What is being tokenized, and how. */
  token->table = table;
  token->mode = mode;
  token->encoding = encoding;
  token->tokenizer = tokenizer;
  token->orig = str;
  token->orig_blen = str_len;

  /* Iteration state. */
  token->curr = NULL;
  token->nstr = NULL;
  token->curr_size = 0;
  token->pos = -1;
  token->status = GRN_TOKEN_DOING;
  token->force_prefix = 0;

  if (!tokenizer) {
    token->nstr = grn_string_open_(ctx, str, str_len,
                                   normalizer, 0, token->encoding);
    if (!token->nstr) {
      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
    } else {
      const char *normalized_text;
      grn_string_get_normalized(ctx, token->nstr,
                                &normalized_text, &(token->curr_size), NULL);
      token->curr = (const unsigned char *)normalized_text;
    }
  } else {
    grn_proc *tokenizer_proc = (grn_proc *)tokenizer;
    grn_obj text_arg, flags_arg;

    GRN_TEXT_INIT(&text_arg, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&text_arg, str, str_len);
    GRN_UINT32_INIT(&flags_arg, 0);
    GRN_UINT32_SET(ctx, &flags_arg, flags);

    token->pctx.caller = NULL;
    token->pctx.user_data.ptr = NULL;
    token->pctx.proc = tokenizer_proc;
    token->pctx.hooks = NULL;
    token->pctx.currh = NULL;
    token->pctx.phase = PROC_INIT;

    /* Push order matters: the string first, then the flags. */
    grn_ctx_push(ctx, &text_arg);
    grn_ctx_push(ctx, &flags_arg);
    tokenizer_proc->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);

    grn_obj_close(ctx, &flags_arg);
    grn_obj_close(ctx, &text_arg);
  }

  if (ctx->rc) {
    grn_token_close(ctx, token);
    token = NULL;
  }
  return token;
}
Exemplo n.º 23
0
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).

  On success the new grn_mecab_tokenizer is stored in `user_data->ptr'. Its
  `next'/`end' pointers delimit the text to be split into tokens: the
  normalized query itself when it already carries tokenized delimiters, an
  empty string for an empty query, and otherwise the MeCab output collected
  in `tokenizer->buf' with trailing whitespace cut off.

  On every failure path the query is closed, NULL is returned and
  `user_data->ptr' is left untouched.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  /* Create the shared MeCab instance on first use; sole_mecab is checked
     again once sole_mecab_mutex is held. */
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    /*
     * Fetch the encoding name before closing the query: the error message
     * is built after the close, and `query' must not be read any more by
     * then.
     */
    const char *query_encoding_name = grn_encoding_to_string(query->encoding);
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     query_encoding_name);
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    /* Already delimited: iterate over the normalized text directly. */
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    /* The MeCab instance is shared, so it is only used under the mutex. */
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        /* The length is unsigned int, hence %u. */
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%u err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      /* NOTE(review): tokenizer->buf was initialized above and is not
         finalized on this path -- confirm whether it can own memory here. */
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      /* The walk starts at the second-to-last byte and overwrites each
         whitespace byte with NUL; `end' is placed right after the last
         byte that was kept. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
Exemplo n.º 24
0
/*
 * Creates a grn_yangram_tokenizer for the query described by `args`, stores
 * it in `user_data->ptr` and fills in its configuration from the remaining
 * parameters.
 *
 * The query is opened with types enabled and with tokenized delimiters and
 * blanks removed. When `use_vgram` is greater than 0 a vgram word table is
 * looked up, and a known-phrase table is always looked up; each table name
 * comes from an environment variable (GRN_VGRAM_WORD_TABLE_NAME /
 * GRN_KNOWN_PHRASE_TABLE_NAME) with a compiled-in default. A missing vgram
 * table is an error; a missing phrase table is not.
 *
 * Always returns NULL; failures are reported through GRN_PLUGIN_ERROR.
 *
 * NOTE(review): the failure paths differ in what they release (the vgram
 * failure leaves both query and tokenizer alone, the hits allocation
 * failure closes the query while `user_data->ptr` still points at the
 * tokenizer), and `tokenizer->hits` is not assigned when no phrase table
 * exists -- confirm against the matching fin function.
 */
static grn_obj *
yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
             unsigned short ngram_unit, grn_bool ignore_blank,
             grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit,
             grn_bool skip_overlap, unsigned short use_vgram)
{
  const unsigned int normalize_flags =
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER |
    GRN_STRING_REMOVE_BLANK;
  grn_tokenizer_query *query;
  grn_yangram_tokenizer *tokenizer;
  const char *normalized;
  unsigned int normalized_length_in_bytes;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer));
  if (!tokenizer) {
    GRN_PLUGIN_ERROR(ctx,GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][yangram] "
                     "memory allocation to grn_yangram_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  /* Plain copies of the caller-supplied settings. */
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->split_symbol = split_symbol;
  tokenizer->split_alpha = split_alpha;
  tokenizer->split_digit = split_digit;
  tokenizer->skip_overlap = skip_overlap;
  tokenizer->use_vgram = use_vgram;

  /* The vgram word table is only required when vgram is switched on. */
  tokenizer->vgram_table = NULL;
  if (tokenizer->use_vgram > 0) {
    const char *vgram_table_name = getenv("GRN_VGRAM_WORD_TABLE_NAME");

    if (!vgram_table_name) {
      vgram_table_name = VGRAM_WORD_TABLE_NAME;
    }
    tokenizer->vgram_table = grn_ctx_get(ctx,
                                         vgram_table_name,
                                         strlen(vgram_table_name));
    if (!tokenizer->vgram_table) {
       GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                        "[tokenizer][yangram] "
                        "couldn't open a vgram table");
       return NULL;
    }
  }

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);

  /* The known-phrase table is optional; scan state exists only with it. */
  {
    const char *phrase_table_name = getenv("GRN_KNOWN_PHRASE_TABLE_NAME");

    if (!phrase_table_name) {
      phrase_table_name = KNOWN_PHRASE_TABLE_NAME;
    }
    tokenizer->phrase_table = grn_ctx_get(ctx,
                                          phrase_table_name,
                                          strlen(phrase_table_name));
    if (tokenizer->phrase_table) {
      tokenizer->hits =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_pat_scan_hit) * MAX_N_HITS);
      if (!tokenizer->hits) {
        GRN_PLUGIN_ERROR(ctx,GRN_NO_MEMORY_AVAILABLE,
                        "[tokenizer][yangram] "
                        "memory allocation to grn_pat_scan_hit failed");
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      }
      tokenizer->scan_rest = normalized;
      tokenizer->nhits = 0;
      tokenizer->current_hit = 0;
    }
  }

  /* Cursor over the normalized text and its per-character types. */
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->rest_length = tokenizer->end - tokenizer->next;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);
  tokenizer->ctypes_next = 0;
  tokenizer->pushed_token_tail = NULL;

  return NULL;
}