Example #1
/*
  This function finalizes a tokenization.
 */
static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  if (tokenizer->buf) {
    GRN_PLUGIN_FREE(ctx, tokenizer->buf);
  }
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}
Example #2
void
grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query != NULL) {
    if (query->normalized_query != NULL) {
      grn_obj_unlink(ctx, query->normalized_query);
    }
    if (query->query_buf != NULL) {
      GRN_PLUGIN_FREE(ctx, query->query_buf);
    }
    GRN_PLUGIN_FREE(ctx, query);
  }
}
Example #3
grn_rc
grn_tokenizer_query_set_raw_string(grn_ctx *ctx,
                                   grn_tokenizer_query *query,
                                   const char *string,
                                   size_t string_length)
{
  GRN_API_ENTER;

  if (query->query_buf) {
    GRN_PLUGIN_FREE(ctx, query->query_buf);
  }

  if (string_length == 0) {
    query->query_buf = NULL;
    query->ptr = NULL;
    query->length = 0;
    query->need_normalize = GRN_TRUE;
  } else {
    query->query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, string_length + 1);
    if (!query->query_buf) {
      /* Don't leave query->ptr dangling: it may still point into the
         buffer that was freed above. */
      query->ptr = NULL;
      query->length = 0;
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][query] failed to duplicate query");
      GRN_API_RETURN(ctx->rc);
    }
    grn_memcpy(query->query_buf, string, string_length);
    query->query_buf[string_length] = '\0';
    query->ptr = query->query_buf;
    query->length = string_length;
  }

  GRN_API_RETURN(ctx->rc);
}
Example #4
void
grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  GRN_API_ENTER;
  if (query) {
    grn_tokenizer_query_fin(ctx, query);
    GRN_PLUGIN_FREE(ctx, query);
  }
  GRN_API_RETURN();
}
Example #5
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags;
  grn_obj *query_str;
  grn_obj *tokenize_mode;

  GRN_API_ENTER;

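  /* The tokenizer's arguments are passed on the context stack; they
     are popped here in reverse order of their push. */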
  flags = grn_ctx_pop(ctx);
  query_str = grn_ctx_pop(ctx);
  tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    GRN_API_RETURN(NULL);
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    GRN_API_RETURN(NULL);
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (!query) {
      GRN_API_RETURN(NULL);
    }
    grn_tokenizer_query_init(ctx, query);
    grn_tokenizer_query_set_raw_string(ctx,
                                       query,
                                       GRN_TEXT_VALUE(query_str),
                                       GRN_TEXT_LEN(query_str));
    if (ctx->rc != GRN_SUCCESS) {
      GRN_PLUGIN_FREE(ctx, query);
      GRN_API_RETURN(NULL);
    }
    if (flags) {
      grn_tokenizer_query_set_flags(ctx, query, GRN_UINT32_VALUE(flags));
    }
    if (tokenize_mode) {
      grn_tokenizer_query_set_mode(ctx, query, GRN_UINT32_VALUE(tokenize_mode));
    }
    grn_tokenizer_query_set_normalize_flags(ctx, query, normalize_flags);
    grn_tokenizer_query_set_lexicon(ctx, query, args[0]);

    grn_tokenizer_query_ensure_have_tokenized_delimiter(ctx, query);

    GRN_API_RETURN(query);
  }
}
Example #6
static grn_obj *
yangram_fin(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
            grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;

  if (!tokenizer) {
    return NULL;
  }
  if (tokenizer->vgram_table) {
    grn_obj_unlink(ctx, tokenizer->vgram_table);
  }

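  /* hits is presumably allocated together with phrase_table in the
     tokenizer's init callback, so it is released in the same branch. */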
  if (tokenizer->phrase_table) {
    grn_obj_unlink(ctx, tokenizer->phrase_table);
    GRN_PLUGIN_FREE(ctx, tokenizer->hits);
  }
  grn_tokenizer_query_close(ctx, tokenizer->query);
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}
Example #7
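/* Allocates a fixed-capacity heap: the header first, then the node
   array; the header is released again if the array allocation fails. */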
static inline score_heap *
score_heap_open(grn_ctx *ctx, int max)
{
  score_heap *h = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap));
  if (!h) { return NULL; }
  h->nodes = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap_node) * max);
  if (!h->nodes) {
    GRN_PLUGIN_FREE(ctx, h);
    return NULL;
  }
  h->n_entries = 0;
  h->limit = max;
  return h;
}
Example #8
File: stem.c  Project: tamano/groonga
static void
stem_fin(grn_ctx *ctx, void *user_data)
{
  grn_stem_token_filter *token_filter = user_data;
  if (!token_filter) {
    return;
  }

  grn_tokenizer_token_fin(ctx, &(token_filter->token));
  if (token_filter->stemmer) {
    sb_stemmer_delete(token_filter->stemmer);
  }
  GRN_PLUGIN_FREE(ctx, token_filter);
}
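/* For context (a sketch, not part of the stem.c excerpt above): a fin
   callback like stem_fin() is typically wired up from the plugin's
   register entry point; stem_init and stem_filter are assumed to be
   defined elsewhere in the file. */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_token_filter_register(ctx, "TokenFilterStem", -1,
                                   stem_init, stem_filter, stem_fin);
}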
Example #9
static void
command_schema_column_output_indexes(grn_ctx *ctx, grn_obj *column)
{
  uint32_t i;
  grn_index_datum *index_data = NULL;
  uint32_t n_index_data = 0;

  if (column) {
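    /* The first call, with a NULL buffer, only returns the number of
       index entries; the second call below fills the allocated array. */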
    n_index_data = grn_column_get_all_index_data(ctx, column, NULL, 0);
    if (n_index_data > 0) {
      index_data = GRN_PLUGIN_MALLOC(ctx,
                                     sizeof(grn_index_datum) * n_index_data);
      if (!index_data) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[schema] failed to allocate memory for indexes");
        return;
      }
      grn_column_get_all_index_data(ctx, column, index_data, n_index_data);
    }
  }

  grn_ctx_output_array_open(ctx, "indexes", n_index_data);
  for (i = 0; i < n_index_data; i++) {
    grn_obj *lexicon;

    grn_ctx_output_map_open(ctx, "index", 4);

    grn_ctx_output_cstr(ctx, "full_name");
    command_schema_output_name(ctx, index_data[i].index);

    grn_ctx_output_cstr(ctx, "table");
    lexicon = grn_ctx_at(ctx, index_data[i].index->header.domain);
    command_schema_output_name(ctx, lexicon);

    grn_ctx_output_cstr(ctx, "name");
    command_schema_output_column_name(ctx, index_data[i].index);

    grn_ctx_output_cstr(ctx, "section");
    grn_ctx_output_uint64(ctx, index_data[i].section);

    grn_ctx_output_map_close(ctx);
  }
  grn_ctx_output_array_close(ctx);

  if (index_data) {
    GRN_PLUGIN_FREE(ctx, index_data);
  }
}
Example #10
static grn_obj *
sample_fin(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
           grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;

  if (!tokenizer) {
    return NULL;
  }

  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_PLUGIN_FREE(ctx, tokenizer);

  return NULL;
}
Example #11
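/* Assumed helper (not part of this excerpt): DIST indexes the
   flattened (lx + 1) x (ly + 1) distance matrix. The Groonga source
   defines it roughly as follows, expanding against the locals of
   calc_edit_distance() below. */
#define DIST(ox, oy) (dists[((lx + 1) * (oy)) + (ox)])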
static uint32_t
calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey, int flags)
{
  uint32_t d = 0; /* returned as uint32_t, so avoid a signed temporary */
  uint32_t cx, lx, cy, ly, *dists;
  char *px, *py;
  for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
  for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
  if ((dists = GRN_PLUGIN_MALLOC(ctx, (lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
    uint32_t x, y;
    for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
    for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
    for (x = 1, px = sx; x <= lx; x++, px += cx) {
      cx = grn_charlen(ctx, px, ex);
      for (y = 1, py = sy; y <= ly; y++, py += cy) {
        cy = grn_charlen(ctx, py, ey);
        if (cx == cy && !memcmp(px, py, cx)) {
          DIST(x, y) = DIST(x - 1, y - 1);
        } else {
          uint32_t a = DIST(x - 1, y) + 1;
          uint32_t b = DIST(x, y - 1) + 1;
          uint32_t c = DIST(x - 1, y - 1) + 1;
          DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
          if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION &&
              x > 1 && y > 1 && cx == cy &&
              memcmp(px, py - cy, cx) == 0 &&
              memcmp(px - cx, py, cx) == 0) {
            uint32_t t = DIST(x - 2, y - 2) + 1;
            DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
          }
        }
      }
    }
    d = DIST(lx, ly);
    GRN_PLUGIN_FREE(ctx, dists);
  }
  return d;
}
Example #12
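/* Releases a heap created by score_heap_open(). Unlike the _open error
   paths, this assumes h is non-NULL. */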
static inline void
score_heap_close(grn_ctx *ctx, score_heap *h)
{
  GRN_PLUGIN_FREE(ctx, h->nodes);
  GRN_PLUGIN_FREE(ctx, h);
}
Example #13
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_obj_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
Example #14
/*
  This function is called both for full text search queries and for
  documents to be indexed, so it receives short and long strings alike.
  The return value of this function is ignored. When an error occurs in
  this function, `ctx->rc' is set to an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
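  /* Lazily create the shared MeCab instance, double-checking under the
     mutex so that only one thread initializes it. */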
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
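/* For context (a sketch, not part of the mecab.c excerpt): the
   init/fin pair from examples #1 and #14 is typically registered from
   the plugin entry point; mecab_next, the "next" callback, is assumed
   to be defined elsewhere in the file. */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_tokenizer_register(ctx, "TokenMecab", 10,
                                mecab_init, mecab_next, mecab_fin);
}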
Example #15
/*
  This function is called both for full text search queries and for
  documents to be indexed, so it receives short and long strings alike.
  The return value of this function is ignored. When an error occurs in
  this function, `ctx->rc' is set to an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) { *p = '\0'; }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}