Example #1
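This helper from Groonga's tokenizer support code decides, at most once per query, whether the normalized query text contains tokenized-delimiter characters. The need_delimiter_check flag makes the check lazy, and the actual scan only runs when GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER is set in the query flags.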
static void
grn_tokenizer_query_ensure_have_tokenized_delimiter(grn_ctx *ctx,
                                                    grn_tokenizer_query *query)
{
  grn_tokenizer_query_ensure_normalized(ctx, query);

  if (!query->need_delimiter_check) {
    return;
  }

  query->need_delimiter_check = GRN_FALSE;

  if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
    const char *normalized_string;
    unsigned int normalized_string_length;

    grn_string_get_normalized(ctx,
                              query->normalized_query,
                              &normalized_string,
                              &normalized_string_length,
                              NULL);
    query->have_tokenized_delimiter =
      grn_tokenizer_have_tokenized_delimiter(ctx,
                                             normalized_string,
                                             normalized_string_length,
                                             query->encoding);
  } else {
    query->have_tokenized_delimiter = GRN_FALSE;
  }
}
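The grn_tokenizer_query fields used above are not part of this listing. A sketch of the relevant members, reconstructed only from how Examples #1 and #3 access them (field order, exact types, and any omitted members are assumptions, not Groonga's header):

/* Sketch: reconstructed from the field accesses in these examples,
 * not copied from groonga/tokenizer.h. */
struct _grn_tokenizer_query {
  grn_obj *normalized_query;     /* grn_string wrapping the normalized text */
  char *query_buf;               /* NUL-terminated copy of the raw query */
  const char *ptr;               /* points at query_buf */
  unsigned int length;           /* raw query length in bytes */
  grn_encoding encoding;         /* encoding of the source table */
  unsigned int flags;            /* e.g. GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER */
  grn_bool have_tokenized_delimiter;
  grn_bool need_delimiter_check; /* drives the lazy check in Example #1 */
  unsigned int tokenize_mode;
  unsigned int token_mode;       /* kept in sync with tokenize_mode */
};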
Example #2
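This init callback for a delimiter-based tokenizer allocates a grn_delimited_tokenizer, records the delimiter byte sequence, and positions a next/end cursor over the normalized query. The actual splitting happens in the corresponding *_next callback, which this listing does not include.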
static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags = 0;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_delimited_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;

  tokenizer->query = query;

  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           tokenizer->query->ptr,
                                           tokenizer->query->length,
                                           tokenizer->query->encoding);
  tokenizer->delimiter = delimiter;
  tokenizer->delimiter_len = delimiter_len;
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
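The grn_delimited_tokenizer definition is also absent from the listing. A sketch inferred from the assignments in delimited_init() above (an assumption, not the upstream definition):

/* Sketch: every member below is assigned in delimited_init();
 * the real struct may declare more. */
typedef struct {
  grn_tokenizer_query *query;
  grn_bool have_tokenized_delimiter;
  const uint8_t *delimiter;      /* byte sequence to split on */
  uint32_t delimiter_len;
  const unsigned char *next;     /* cursor into the normalized string */
  const unsigned char *end;      /* one past the last byte */
  grn_tokenizer_token token;     /* reusable token output buffer */
} grn_delimited_tokenizer;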
Example #3
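grn_tokenizer_query_open() is the constructor the other examples rely on. It pops its three stack arguments (flags, query string, tokenize mode), duplicates the query into a NUL-terminated buffer, normalizes it with the table's normalizer, and precomputes have_tokenized_delimiter when the corresponding flag is set. Note that every error path frees whatever has been allocated so far before returning NULL.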
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
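Each successful grn_tokenizer_query_open() is paired with grn_tokenizer_query_close() in these examples, whose body is not shown. A minimal sketch of what the teardown has to release, mirroring the allocations above (the name example_query_close and the exact release calls are assumptions, not Groonga's implementation):

/* Hypothetical teardown mirroring grn_tokenizer_query_open() above. */
static void
example_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (!query) {
    return;
  }
  if (query->normalized_query) {
    grn_obj_unlink(ctx, query->normalized_query); /* release the grn_string */
  }
  if (query->query_buf) {
    GRN_PLUGIN_FREE(ctx, query->query_buf);
  }
  GRN_PLUGIN_FREE(ctx, query);
}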
Example #4
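The MeCab-based tokenizer init. It lazily creates a single shared mecab_t instance (sole_mecab) behind a mutex, verifies that the MeCab dictionary charset matches the table encoding, and then either walks the normalized string directly (when tokenized delimiters are present) or copies MeCab's wakati output into a private buffer and trims trailing whitespace. Note that this snippet predates the four-argument grn_tokenizer_query_open() defined in Example #3: here the function is called with three arguments, without normalize_flags.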
/*
  This function is called both for a full-text search query and for a
  document to be indexed, so both short and long strings are given.
  The return value of this function is ignored. When an error occurs in
  this function, `ctx->rc` is overwritten with an error code (not
  GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* Certain versions of MeCab return a trailing LF or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) { *p = '\0'; }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
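These init callbacks only become usable tokenizers once registered from the plugin's entry point. A sketch of what registration looks like for the MeCab tokenizer, assuming mecab_next and mecab_fin callbacks that this listing does not include:

/* Sketch of a tokenizer plugin entry point; mecab_next and mecab_fin
 * are assumed here, as only mecab_init appears in this listing. */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_tokenizer_register(ctx, "TokenMecab", 10 /* strlen("TokenMecab") */,
                                mecab_init, mecab_next, mecab_fin);
}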