Ejemplo n.º 1
0
static grn_bool
chunked_tokenize_utf8_chunk(grn_ctx *ctx,
                            grn_mecab_tokenizer *tokenizer,
                            const char *chunk,
                            unsigned int chunk_bytes)
{
  const char *tokenized_chunk;
  size_t tokenized_chunk_length;

  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
  if (!tokenized_chunk) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab][chunk] "
                     "mecab_sparse_tostr2() failed len=%d err=%s",
                     chunk_bytes,
                     mecab_strerror(tokenizer->mecab));
    return GRN_FALSE;
  }

  if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
    GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
  }

  tokenized_chunk_length = strlen(tokenized_chunk);
  if (tokenized_chunk_length >= 1 &&
      isspace(tokenized_chunk[tokenized_chunk_length - 1])) {
    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
                 tokenized_chunk, tokenized_chunk_length - 1);
  } else {
    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
                 tokenized_chunk, tokenized_chunk_length);
  }

  return GRN_TRUE;
}
Ejemplo n.º 2
0
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
Ejemplo n.º 3
0
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) { *p = '\0'; }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
Ejemplo n.º 4
0
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf, *p;
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize, len;
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR, "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }
  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  len = token->nstr->norm_blen;
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }
  /* A certain version of mecab returns trailing lf or spaces. */
  for (p = buf + bufsize - 2;
       buf <= p && isspace(*(unsigned char *)p);
       p--) { *p = '\0'; }
  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = p + 1;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}