Example #1
static grn_obj *
ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit,
           uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
  unsigned int normalize_flags =
    GRN_STRING_REMOVE_BLANK |
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_ngram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][ngram] "
        "memory allocation to grn_ngram_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->uni_alpha = uni_alpha;
  tokenizer->uni_digit = uni_digit;
  tokenizer->uni_symbol = uni_symbol;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->overlap = 0;
  tokenizer->pos = 0;
  tokenizer->skip = 0;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            &(tokenizer->len));
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);
  return NULL;
}
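
A matching fin callback has to release everything this init acquires. A minimal sketch of that counterpart, assuming the usual Groonga pairing of grn_tokenizer_token_fin, grn_tokenizer_query_close, and GRN_FREE:

static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_FREE(tokenizer);
  return NULL;
}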
Example #2
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->get.n_skip_tokens = 0;

  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end   = GRN_FALSE;
  tokenizer->is_start_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->nth_char = 0;
  tokenizer->char_types =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
Example #3
static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags = 0;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_delimited_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;

  tokenizer->query = query;

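  /* Check whether the input already contains the tokenized-token
     delimiter (U+FFFE); if so, the input is treated as pre-tokenized. */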
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           tokenizer->query->ptr,
                                           tokenizer->query->length,
                                           tokenizer->query->encoding);
  tokenizer->delimiter = delimiter;
  tokenizer->delimiter_len = delimiter_len;
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
Example #4
static grn_obj *
sample_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  sample_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(sample_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][sample] "
                     "memory allocation to sample_tokenizer failed");
    return NULL;
  }

  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->next = normalized_string;
  tokenizer->rest = normalized_string_length;

  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
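
A plugin init like this is typically paired with next/fin callbacks and registered from the plugin's register hook. A hedged sketch, assuming sample_next and sample_fin are defined elsewhere and "TokenSample" is the chosen tokenizer name:

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_tokenizer_register(ctx, "TokenSample", strlen("TokenSample"),
                                sample_init, sample_next, sample_fin);
}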
Example #5
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = 0;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->get.have_begin = GRN_FALSE;
  tokenizer->get.have_end   = GRN_FALSE;
  tokenizer->get.n_skip_tokens = 0;

  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end   = GRN_FALSE;
  tokenizer->is_first_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

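  /* In GET (search) mode, "\A" and "\z" anchors in the raw query are
     recorded as flags and trimmed from the normalized text so they are
     not tokenized as literal characters. */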
  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
    unsigned int query_length = tokenizer->query->length;
    if (query_length >= 2) {
      const char *query_string = tokenizer->query->ptr;
      grn_encoding encoding = tokenizer->query->encoding;
      if (query_string[0] == '\\' && query_string[1] == 'A') {
        tokenizer->get.have_begin = GRN_TRUE;
        /* TODO: This assumes that both "\\" and "A" are normalized to
           one character each. The normalizer may omit a character or
           expand it to multiple characters. */
        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                        encoding);
        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                        encoding);
      }
      if (query_string[query_length - 2] == '\\' &&
          query_string[query_length - 1] == 'z') {
        tokenizer->get.have_end = GRN_TRUE;
        /* TODO: This assumes that both "\\" and "z" are normalized to
           one character each. The normalizer may omit a character or
           expand it to multiple characters. */
        tokenizer->end -= grn_charlen_(ctx,
                                       tokenizer->end - 1,
                                       tokenizer->end,
                                       encoding);
        tokenizer->end -= grn_charlen_(ctx,
                                       tokenizer->end - 1,
                                       tokenizer->end,
                                       encoding);
      }
    }
  }

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
Example #6
grn_tokenizer_query *
grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
{
  return grn_tokenizer_query_open(ctx, num_args, args, 0);
}
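
This wrapper is apparently kept for backward compatibility: it forwards to grn_tokenizer_query_open with no normalizer flags. New code would call grn_tokenizer_query_open directly so it can request normalization options, as the other examples do; a minimal sketch:

  unsigned int normalize_flags = GRN_STRING_WITH_TYPES | GRN_STRING_REMOVE_BLANK;
  grn_tokenizer_query *query;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL; /* ctx->rc already carries the error */
  }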
Example #7
static grn_obj *
yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
             unsigned short ngram_unit, grn_bool ignore_blank,
             grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit,
             grn_bool skip_overlap, unsigned short use_vgram)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags =
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER |
    GRN_STRING_REMOVE_BLANK;

  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_yangram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer)))) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][yangram] "
                     "memory allocation to grn_yangram_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;
  tokenizer->skip_overlap = skip_overlap;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->split_symbol = split_symbol;
  tokenizer->split_alpha = split_alpha;
  tokenizer->split_digit = split_digit;
  tokenizer->use_vgram = use_vgram;
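  /* The vgram word table name can be overridden via the
     GRN_VGRAM_WORD_TABLE_NAME environment variable; otherwise the
     compiled-in VGRAM_WORD_TABLE_NAME default is used. */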
  if (tokenizer->use_vgram > 0) {
    const char *vgram_word_table_name_env;
    vgram_word_table_name_env = getenv("GRN_VGRAM_WORD_TABLE_NAME");

    if (vgram_word_table_name_env) {
      tokenizer->vgram_table = grn_ctx_get(ctx,
                                           vgram_word_table_name_env,
                                           strlen(vgram_word_table_name_env));
    } else {
      tokenizer->vgram_table = grn_ctx_get(ctx,
                                           VGRAM_WORD_TABLE_NAME,
                                           strlen(VGRAM_WORD_TABLE_NAME));
    }
    if (!tokenizer->vgram_table) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[tokenizer][yangram] "
                       "couldn't open a vgram table");
      grn_tokenizer_query_close(ctx, query);
      return NULL;
    }
  } else {
    tokenizer->vgram_table = NULL;
  }
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  {
    const char *phrase_table_name_env;
    phrase_table_name_env = getenv("GRN_KNOWN_PHRASE_TABLE_NAME");

    if (phrase_table_name_env) {
      tokenizer->phrase_table = grn_ctx_get(ctx,
                                            phrase_table_name_env,
                                            strlen(phrase_table_name_env));
    } else {
      tokenizer->phrase_table = grn_ctx_get(ctx,
                                            KNOWN_PHRASE_TABLE_NAME,
                                            strlen(KNOWN_PHRASE_TABLE_NAME));
    }
    if (tokenizer->phrase_table) {
      if (!(tokenizer->hits =
          GRN_PLUGIN_MALLOC(ctx, sizeof(grn_pat_scan_hit) * MAX_N_HITS))) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[tokenizer][yangram] "
                         "memory allocation to grn_pat_scan_hit failed");
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      } else {
        tokenizer->scan_rest = normalized;
        tokenizer->nhits = 0;
        tokenizer->current_hit = 0;
      }
    } else {
      tokenizer->phrase_table = NULL;
    }
  }

  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->rest_length = tokenizer->end - tokenizer->next;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  tokenizer->pushed_token_tail = NULL;
  tokenizer->ctypes_next = 0;

  return NULL;
}
Example #8
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short and long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
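      /* Starting at the second-to-last byte is presumably safe because
         wakati output ends with a newline and `end` excludes the final
         byte either way. */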
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
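
Because this version buffers the MeCab output in a grn_obj text buffer (tokenizer->buf), the matching fin callback also has to finalize that buffer. A minimal sketch, assuming the usual plugin teardown:

static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_OBJ_FIN(ctx, &(tokenizer->buf));
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}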
Example #9
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short and long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, 0);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) { *p = '\0'; }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}