Пример #1
0
static grn_rc
mecab_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  size_t cl;
  grn_mecab_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      token->next = (unsigned char *)e;
      break;
    }
    if (grn_isspace((const char *)r, token->encoding)) {
      const unsigned char *q = r;
      while ((cl = grn_isspace((const char *)q, token->encoding))) { q += cl; }
      token->next = (unsigned char *)q;
      break;
    }
  }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
Пример #2
0
/*
  This function returns tokens one by one.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *token = user_data->ptr;
  char *p = token->next, *r;
  char *e = token->end;
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, r, e, token->encoding))) {
      token->next = e;
      break;
    }
    if (grn_isspace(r, token->encoding)) {
      char *q = r;
      while ((cl = grn_isspace(q, token->encoding))) { q += cl; }
      token->next = q;
      break;
    }
  }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
Пример #3
0
void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
  GRN_UINT32_SET(ctx, &token->status, status);
  grn_ctx_push(ctx, &token->str);
  grn_ctx_push(ctx, &token->status);
}
Пример #4
0
static grn_obj *
uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_uvector_tokenizer_info *token = user_data->ptr;
  byte *p = token->curr + token->unit;
  if (token->tail < p) {
    GRN_TEXT_SET_REF(&token->curr_, token->curr, 0);
    GRN_UINT32_SET(ctx, &token->stat_, GRN_TOKEN_LAST);
  } else {
    GRN_TEXT_SET_REF(&token->curr_, token->curr, token->unit);
    token->curr = p;
    GRN_UINT32_SET(ctx, &token->stat_, token->tail == p ? GRN_TOKEN_LAST : 0);
  }
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
Пример #5
0
static void
bench_setup_points(gpointer user_data,
                   const gchar *start_point_string,
                   const gchar *end_point_string,
                   grn_builtin_type wgs84_or_tgs)
{
  BenchmarkData *data = user_data;
  grn_obj start_point_text, end_point_text;

  GRN_TEXT_INIT(&start_point_text, 0);
  GRN_TEXT_INIT(&end_point_text, 0);
  GRN_TEXT_SETS(data->context, &start_point_text, start_point_string);
  GRN_TEXT_SETS(data->context, &end_point_text, end_point_string);

  data->start_point = grn_obj_open(data->context, GRN_BULK, 0, wgs84_or_tgs);
  data->end_point = grn_obj_open(data->context, GRN_BULK, 0, wgs84_or_tgs);
  grn_obj_cast(data->context, &start_point_text, data->start_point, GRN_FALSE);
  grn_obj_cast(data->context, &end_point_text, data->end_point, GRN_FALSE);
  grn_ctx_push(data->context, data->start_point);
  grn_ctx_push(data->context, data->end_point);

  grn_obj_unlink(data->context, &start_point_text);
  grn_obj_unlink(data->context, &end_point_text);
}
Пример #6
0
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_delimited_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      token->next = (unsigned char *)e;
      break;
    }
    if (r + token->delimiter_len <= e &&
        !memcmp(r, token->delimiter, token->delimiter_len)) {
      token->next = r + token->delimiter_len;
      break;
    }
  }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
Пример #7
0
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode)
{
  grn_token *token;
  grn_encoding encoding;
  grn_obj *tokenizer;
  if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer)) { return NULL; }
  if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
  token->table = table;
  token->mode = mode;
  token->encoding = encoding;
  token->tokenizer = tokenizer;
  token->orig = str;
  token->orig_blen = str_len;
  token->curr = NULL;
  token->curr_size = 0;
  token->pos = -1;
  token->status = grn_token_doing;
  token->force_prefix = 0;
  if (tokenizer) {
    grn_obj str_;
    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&str_, str, str_len);
    token->pctx.caller = NULL;
    token->pctx.user_data.ptr = NULL;
    token->pctx.proc = (grn_proc *)tokenizer;
    token->pctx.hooks = NULL;
    token->pctx.currh = NULL;
    token->pctx.phase = PROC_INIT;
    grn_ctx_push(ctx, &str_);
    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
    grn_obj_close(ctx, &str_);
  }
  if (ctx->rc) {
    GRN_FREE(token);
    token = NULL;
  }
  return token;
}
Пример #8
0
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r = p, *e = token->end;
  int32_t len = 0, pos = token->pos + token->skip, status = 0;
  uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
  if (cp && token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      if (/* !token->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != grn_str_alpha) { break; }
    }
    token->next = r;
    token->overlap = 0;
  } else if (cp && token->uni_digit && GRN_STR_CTYPE(*cp) == grn_str_digit) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      if (/* !token->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != grn_str_digit) { break; }
    }
    token->next = r;
    token->overlap = 0;
  } else if (cp && token->uni_symbol && GRN_STR_CTYPE(*cp) == grn_str_symbol) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      if (!token->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != grn_str_symbol) { break; }
    }
    token->next = r;
    token->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        token->status = grn_token_not_found;
        return NULL;
      }
      len = grn_str_len(key, token->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, token->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= token->tail) { continue; }
      p += strlen(key);
      if (!*p && !token->add) { token->status = grn_token_done; }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      token->next = r;
      while (len < token->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
        if (cp) {
          if (!token->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          if ((token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) ||
              (token->uni_digit && GRN_STR_CTYPE(*cp) == grn_str_digit) ||
              (token->uni_symbol && GRN_STR_CTYPE(*cp) == grn_str_symbol)) { break; }
        }
        len++;
        r += cl;
      }
      if (token->overlap) { status |= GRN_TOKEN_OVERLAP; }
      if (len < token->ngram_unit) { status |= GRN_TOKEN_UNMATURED; }
      token->overlap = (len > 1) ? 1 : 0;
    }
  }
  token->pos = pos;
  token->len = len;
  token->tail = pos + len - 1;
  if (p == r || token->next == e) {
    token->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    token->skip = token->overlap ? 1 : len;
  }
  if (r == e) { status |= GRN_TOKEN_REACH_END; }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, status);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
Пример #9
0
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode, unsigned int flags)
{
  grn_token *token;
  grn_encoding encoding;
  grn_obj *tokenizer;
  grn_obj *normalizer;
  grn_obj_flags table_flags;
  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
                         &normalizer)) {
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
  token->table = table;
  token->mode = mode;
  token->encoding = encoding;
  token->tokenizer = tokenizer;
  token->orig = str;
  token->orig_blen = str_len;
  token->curr = NULL;
  token->nstr = NULL;
  token->curr_size = 0;
  token->pos = -1;
  token->status = GRN_TOKEN_DOING;
  token->force_prefix = 0;
  if (tokenizer) {
    grn_obj str_, flags_;
    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&str_, str, str_len);
    GRN_UINT32_INIT(&flags_, 0);
    GRN_UINT32_SET(ctx, &flags_, flags);
    token->pctx.caller = NULL;
    token->pctx.user_data.ptr = NULL;
    token->pctx.proc = (grn_proc *)tokenizer;
    token->pctx.hooks = NULL;
    token->pctx.currh = NULL;
    token->pctx.phase = PROC_INIT;
    grn_ctx_push(ctx, &str_);
    grn_ctx_push(ctx, &flags_);
    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
    grn_obj_close(ctx, &flags_);
    grn_obj_close(ctx, &str_);
  } else {
    int nflags = 0;
    token->nstr = grn_string_open_(ctx, str, str_len,
                                   normalizer, nflags, token->encoding);
    if (token->nstr) {
      const char *normalized;
      grn_string_get_normalized(ctx, token->nstr,
                                &normalized, &(token->curr_size), NULL);
      token->curr = (const unsigned char *)normalized;
    } else {
      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
    }
  }
  if (ctx->rc) {
    grn_token_close(ctx, token);
    token = NULL;
  }
  return token;
}