Example #1
static grn_obj *
ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
           uint8_t ngram_unit, uint8_t uni_alpha, uint8_t uni_digit,
           uint8_t uni_symbol, uint8_t ignore_blank)
{
  unsigned int normalize_flags =
    GRN_STRING_REMOVE_BLANK |
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_ngram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][ngram] "
        "memory allocation to grn_ngram_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->uni_alpha = uni_alpha;
  tokenizer->uni_digit = uni_digit;
  tokenizer->uni_symbol = uni_symbol;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->overlap = 0;
  tokenizer->pos = 0;
  tokenizer->skip = 0;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            &(tokenizer->len));
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);
  /* Tokenizer callbacks report errors through ctx and always return NULL. */
  return NULL;
}
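
The init function above only primes the shared n-gram state; the concrete tokenizers are thin wrappers that pin the unit size and character-group flags before delegating. A minimal sketch of such a wrapper, assuming bigram settings (the name bigram_init and the exact flag values are illustrative, not taken from this listing):

static grn_obj *
bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* Assumed settings: ngram_unit = 2, keep alphabetic/digit/symbol runs
     whole (uni_* = 1), and do not ignore blanks. */
  return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0);
}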
Example #2
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->get.n_skip_tokens = 0;

  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end   = GRN_FALSE;
  tokenizer->is_start_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->nth_char = 0;
  tokenizer->char_types =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
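
As with the n-gram variant, the init callback returns NULL even on success and leaves cleanup to the matching fin callback. A minimal sketch of that counterpart, assuming the name regexp_fin and that it simply releases what regexp_init acquired:

static grn_obj *
regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  /* Release the token state, the query, the text buffer, then the struct. */
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  GRN_OBJ_FIN(ctx, &(tokenizer->buffer));
  GRN_FREE(tokenizer);
  return NULL;
}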
Example #3
static grn_obj *
yangram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
             unsigned short ngram_unit, grn_bool ignore_blank,
             grn_bool split_symbol, grn_bool split_alpha, grn_bool split_digit,
             grn_bool skip_overlap, unsigned short use_vgram)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags =
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER |
    GRN_STRING_REMOVE_BLANK;

  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_yangram_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }
  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_yangram_tokenizer)))) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][yangram] "
                     "memory allocation to grn_yangram_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;
  tokenizer->skip_overlap = skip_overlap;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->split_symbol = split_symbol;
  tokenizer->split_alpha = split_alpha;
  tokenizer->split_digit = split_digit;
  tokenizer->use_vgram = use_vgram;
  if (tokenizer->use_vgram > 0) {
    /* The vgram word table name can be overridden with the
       GRN_VGRAM_WORD_TABLE_NAME environment variable. */
    const char *vgram_word_table_name_env;
    vgram_word_table_name_env = getenv("GRN_VGRAM_WORD_TABLE_NAME");

    if (vgram_word_table_name_env) {
      tokenizer->vgram_table = grn_ctx_get(ctx,
                                           vgram_word_table_name_env,
                                           strlen(vgram_word_table_name_env));
    } else {
      tokenizer->vgram_table = grn_ctx_get(ctx,
                                           VGRAM_WORD_TABLE_NAME,
                                           strlen(VGRAM_WORD_TABLE_NAME));
    }
    if (!tokenizer->vgram_table) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "[tokenizer][yangram] "
                       "couldn't open a vgram table");
      /* Close the query here to avoid leaking it on this error path,
         matching the phrase-table error path below. */
      grn_tokenizer_query_close(ctx, query);
      return NULL;
    }
  } else {
    tokenizer->vgram_table = NULL;
  }
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  {
    /* Likewise, the known-phrase table name can be overridden with
       GRN_KNOWN_PHRASE_TABLE_NAME. */
    const char *phrase_table_name_env;
    phrase_table_name_env = getenv("GRN_KNOWN_PHRASE_TABLE_NAME");

    if (phrase_table_name_env) {
      tokenizer->phrase_table = grn_ctx_get(ctx,
                                            phrase_table_name_env,
                                            strlen(phrase_table_name_env));
    } else {
      tokenizer->phrase_table = grn_ctx_get(ctx,
                                            KNOWN_PHRASE_TABLE_NAME,
                                            strlen(KNOWN_PHRASE_TABLE_NAME));
    }
    if (tokenizer->phrase_table) {
      if (!(tokenizer->hits =
          GRN_PLUGIN_MALLOC(ctx, sizeof(grn_pat_scan_hit) * MAX_N_HITS))) {
        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                         "[tokenizer][yangram] "
                         "memory allocation to grn_pat_scan_hit failed");
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      } else {
        tokenizer->scan_rest = normalized;
        tokenizer->nhits = 0;
        tokenizer->current_hit = 0;
      }
    } else {
      tokenizer->phrase_table = NULL;
    }
  }

  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->rest_length = tokenizer->end - tokenizer->next;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  tokenizer->pushed_token_tail = NULL;
  tokenizer->ctypes_next = 0;

  return NULL;
}
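
A plugin tokenizer like this only becomes usable once its init/next/fin triple is registered from the plugin's register hook. A minimal sketch, assuming a bigram wrapper and the tokenizer name TokenYaBigram; the wrapper name, flag values, and the yangram_next/yangram_fin callbacks (not shown in this listing) are assumptions:

static grn_obj *
yabigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* Assumed settings: bigrams, overlap skipping on, vgram off. */
  return yangram_init(ctx, nargs, args, user_data,
                      2, GRN_FALSE, GRN_FALSE, GRN_FALSE, GRN_FALSE,
                      GRN_TRUE, 0);
}

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  grn_tokenizer_register(ctx, "TokenYaBigram", strlen("TokenYaBigram"),
                         yabigram_init, yangram_next, yangram_fin);
  return ctx->rc;
}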