/*
 * Emits the next whitespace-delimited token from the tokenizer buffer.
 *
 * Scanning starts at token->next and stops at the first character for which
 * grn_isspace() reports whitespace, at a position where grn_charlen_()
 * returns 0, or at token->end.  The scanned span [p, r) is stored in
 * token->curr_ by reference, the status (GRN_TOKEN_LAST when the scan ran
 * all the way to token->end, otherwise 0) is stored in token->stat_, and
 * both objects are handed to grn_ctx_push().
 *
 * ctx:       context passed to grn_charlen_(), GRN_UINT32_SET and
 *            grn_ctx_push().
 * table:     not used by this function.
 * user_data: its ptr member must reference the grn_mecab_tokenizer state.
 *
 * Always returns GRN_SUCCESS.
 */
static grn_rc
mecab_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  size_t cl;
  grn_mecab_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;

  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      /* Undecodable position: nothing more can be tokenized. */
      token->next = (unsigned char *)e;
      break;
    }
    if (grn_isspace((const char *)r, token->encoding)) {
      const unsigned char *q = r;
      /*
       * Skip the whole run of whitespace, but never look at or step past
       * the end of the buffer: without the q < e bound the skip relied on
       * a non-space byte following the buffer and could leave token->next
       * beyond token->end.
       */
      while (q < e && (cl = grn_isspace((const char *)q, token->encoding))) {
        q += cl;
      }
      token->next = (unsigned char *)q;
      break;
    }
  }

  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
/*
 * Returns tokens one by one.
 *
 * Each call scans from token->next up to the first character for which
 * grn_isspace() reports whitespace, a position where grn_charlen_() returns
 * 0, or token->end.  The scanned span [p, r) is stored in token->curr_ by
 * reference, the status (GRN_TOKEN_LAST when the scan ran all the way to
 * token->end, otherwise 0) is stored in token->stat_, and both objects are
 * handed to grn_ctx_push().
 *
 * ctx:       context passed to grn_charlen_(), GRN_UINT32_SET and
 *            grn_ctx_push().
 * nargs:     not used by this function.
 * args:      not used by this function.
 * user_data: its ptr member must reference the grn_mecab_tokenizer state.
 *
 * Always returns NULL.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *token = user_data->ptr;
  char *p = token->next, *r;
  char *e = token->end;

  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, r, e, token->encoding))) {
      /* Undecodable position: nothing more can be tokenized. */
      token->next = e;
      break;
    }
    if (grn_isspace(r, token->encoding)) {
      char *q = r;
      /*
       * Skip the whole run of whitespace, but never look at or step past
       * the end of the buffer: without the q < e bound the skip relied on
       * a non-space byte following the buffer and could leave token->next
       * beyond token->end.
       */
      while (q < e && (cl = grn_isspace(q, token->encoding))) {
        q += cl;
      }
      token->next = q;
      break;
    }
  }

  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
/*
 * Publishes one token: stores the given text (by reference) and status in
 * the token object, then hands both to grn_ctx_push(), text first.
 *
 * ctx:        context used for the status assignment and the pushes.
 * token:      holder whose str and status members are overwritten.
 * str_ptr:    start of the token text; referenced, not copied.
 * str_length: length of the token text.
 * status:     status value to report alongside the text.
 */
void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  grn_obj *text_obj = &token->str;
  grn_obj *status_obj = &token->status;

  GRN_TEXT_SET_REF(text_obj, str_ptr, str_length);
  GRN_UINT32_SET(ctx, status_obj, status);

  /* Push order matters to the consumer: text first, status second. */
  grn_ctx_push(ctx, text_obj);
  grn_ctx_push(ctx, status_obj);
}
/*
 * Emits the next fixed-size element of a uvector tokenizer.
 *
 * If a full unit still fits before token->tail, that unit is emitted and
 * token->curr advances past it; the status is GRN_TOKEN_LAST when the unit
 * ends exactly at token->tail.  Otherwise an empty token is emitted with
 * GRN_TOKEN_LAST and token->curr is left untouched.  The text and status
 * objects are handed to grn_ctx_push() in that order.
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_uvector_tokenizer_info *info = user_data->ptr;
  byte *unit_end = info->curr + info->unit;
  unsigned int status;

  if (unit_end <= info->tail) {
    GRN_TEXT_SET_REF(&info->curr_, info->curr, info->unit);
    info->curr = unit_end;
    status = (info->tail == unit_end) ? GRN_TOKEN_LAST : 0;
  } else {
    /* Not enough bytes left for a whole unit: report an empty last token. */
    GRN_TEXT_SET_REF(&info->curr_, info->curr, 0);
    status = GRN_TOKEN_LAST;
  }
  GRN_UINT32_SET(ctx, &info->stat_, status);

  grn_ctx_push(ctx, &info->curr_);
  grn_ctx_push(ctx, &info->stat_);
  return NULL;
}
/*
 * Benchmark setup helper: builds the start and end point objects from their
 * textual form.
 *
 * Both strings are placed in temporary text bulks, two bulks of type
 * wgs84_or_tgs are opened and stored in data->start_point / data->end_point,
 * the texts are cast into them, and both point objects are handed to
 * grn_ctx_push() (start first).  The temporary text bulks are unlinked
 * before returning.
 *
 * user_data:          the BenchmarkData for the current benchmark.
 * start_point_string: textual start point.
 * end_point_string:   textual end point.
 * wgs84_or_tgs:       builtin type id used for both point bulks.
 */
static void
bench_setup_points(gpointer user_data,
                   const gchar *start_point_string,
                   const gchar *end_point_string,
                   grn_builtin_type wgs84_or_tgs)
{
  BenchmarkData *data = user_data;
  grn_ctx *context = data->context;
  grn_obj start_text;
  grn_obj end_text;

  /* Temporary text holders for the two inputs. */
  GRN_TEXT_INIT(&start_text, 0);
  GRN_TEXT_INIT(&end_text, 0);
  GRN_TEXT_SETS(context, &start_text, start_point_string);
  GRN_TEXT_SETS(context, &end_text, end_point_string);

  /* Destination bulks of the requested point type. */
  data->start_point = grn_obj_open(context, GRN_BULK, 0, wgs84_or_tgs);
  data->end_point = grn_obj_open(context, GRN_BULK, 0, wgs84_or_tgs);

  grn_obj_cast(context, &start_text, data->start_point, GRN_FALSE);
  grn_obj_cast(context, &end_text, data->end_point, GRN_FALSE);

  grn_ctx_push(context, data->start_point);
  grn_ctx_push(context, data->end_point);

  grn_obj_unlink(context, &start_text);
  grn_obj_unlink(context, &end_text);
}
/*
 * Emits the next token of a delimiter-separated input.
 *
 * The scan walks character by character from token->next.  It stops at the
 * first position where the delimiter bytes match (token->next then resumes
 * just after the delimiter), at a position where grn_charlen_() returns 0
 * (token->next becomes token->end), or at token->end.  The scanned span is
 * stored in token->curr_ by reference; the status is GRN_TOKEN_LAST only
 * when the scan reached token->end.  Text and status are handed to
 * grn_ctx_push() in that order.
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *token = user_data->ptr;
  const unsigned char *start = token->next;
  const unsigned char *end = token->end;
  const unsigned char *cursor = start;

  while (cursor < end) {
    size_t char_len = grn_charlen_(ctx, (char *)cursor, (char *)end,
                                   token->encoding);
    if (!char_len) {
      token->next = (unsigned char *)end;
      break;
    }
    /* Only compare when the whole delimiter fits in the remaining bytes. */
    if (cursor + token->delimiter_len <= end &&
        !memcmp(cursor, token->delimiter, token->delimiter_len)) {
      token->next = cursor + token->delimiter_len;
      break;
    }
    cursor += char_len;
  }

  GRN_TEXT_SET_REF(&token->curr_, start, cursor - start);
  GRN_UINT32_SET(ctx, &token->stat_, cursor == end ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
/*
 * Allocates and initializes a token cursor over str for the given table.
 *
 * The table's encoding and tokenizer are looked up with
 * grn_table_get_info().  When the table has a tokenizer, the input string
 * is wrapped in a shallow-copy text object, handed to grn_ctx_push(), and
 * the tokenizer's PROC_INIT function is invoked with the table as its only
 * argument.
 *
 * ctx:     context used for allocation and error reporting.
 * table:   table whose tokenizer settings are used.
 * str:     input text; referenced, not copied.
 * str_len: length of str.
 * mode:    stored in the cursor's mode member.
 *
 * Returns the new cursor, or NULL when grn_table_get_info() reports a
 * failure, when allocation fails, or when ctx->rc is set after
 * initialization (the cursor is freed in that case).
 */
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode)
{
  grn_token *cursor;
  grn_encoding table_encoding;
  grn_obj *table_tokenizer;

  if (grn_table_get_info(ctx, table, NULL, &table_encoding, &table_tokenizer)) {
    return NULL;
  }
  cursor = GRN_MALLOC(sizeof(grn_token));
  if (!cursor) {
    return NULL;
  }

  cursor->table = table;
  cursor->mode = mode;
  cursor->encoding = table_encoding;
  cursor->tokenizer = table_tokenizer;
  cursor->orig = str;
  cursor->orig_blen = str_len;
  cursor->curr = NULL;
  cursor->curr_size = 0;
  cursor->pos = -1;
  cursor->status = grn_token_doing;
  cursor->force_prefix = 0;

  if (table_tokenizer) {
    grn_proc *proc = (grn_proc *)table_tokenizer;
    grn_obj input;

    GRN_TEXT_INIT(&input, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&input, str, str_len);

    cursor->pctx.caller = NULL;
    cursor->pctx.user_data.ptr = NULL;
    cursor->pctx.proc = proc;
    cursor->pctx.hooks = NULL;
    cursor->pctx.currh = NULL;
    cursor->pctx.phase = PROC_INIT;

    /* The input text is pushed before the init function is called. */
    grn_ctx_push(ctx, &input);
    proc->funcs[PROC_INIT](ctx, 1, &table, &cursor->pctx.user_data);
    grn_obj_close(ctx, &input);
  }

  if (ctx->rc) {
    GRN_FREE(cursor);
    return NULL;
  }
  return cursor;
}
/*
 * Emits the next token of an N-gram tokenizer.
 *
 * When character-type information is available (token->ctypes) and the
 * current character belongs to a class configured for unigram grouping
 * (uni_alpha / uni_digit / uni_symbol), the whole run of that class is
 * emitted as one token.  Letter and digit runs always stop after a
 * character flagged blank; symbol runs do so only when ignore_blank is off.
 *
 * Otherwise up to token->ngram_unit characters are gathered.  The status
 * gets GRN_TOKEN_OVERLAP when the previous token left the overlap flag set
 * and GRN_TOKEN_UNMATURED when fewer than ngram_unit characters were
 * collected.
 *
 * In every case pos/len/tail/skip are updated, GRN_TOKEN_LAST is set when
 * nothing was consumed or token->next equals token->end,
 * GRN_TOKEN_REACH_END is set when the scan pointer equals token->end, and
 * the token text (by reference) and status are handed to grn_ctx_push().
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *token = user_data->ptr;
  /* p, r, e, pos and len keep their short names because the conditionally
     compiled block below refers to them. */
  const unsigned char *p = token->next;
  const unsigned char *r = p;
  const unsigned char *e = token->end;
  size_t char_len;
  int32_t len = 0;
  int32_t pos = token->pos + token->skip;
  int32_t status = 0;
  uint_least8_t *ctype = token->ctypes ? token->ctypes + pos : NULL;
  int in_unigram_run = 0;   /* non-zero when a same-class run is emitted */
  int run_ctype = 0;        /* character class of that run */
  int stop_at_blank = 1;    /* whether a blank-flagged character ends it */

  if (ctype) {
    if (token->uni_alpha && GRN_STR_CTYPE(*ctype) == grn_str_alpha) {
      in_unigram_run = 1;
      run_ctype = grn_str_alpha;
    } else if (token->uni_digit && GRN_STR_CTYPE(*ctype) == grn_str_digit) {
      in_unigram_run = 1;
      run_ctype = grn_str_digit;
    } else if (token->uni_symbol && GRN_STR_CTYPE(*ctype) == grn_str_symbol) {
      in_unigram_run = 1;
      run_ctype = grn_str_symbol;
      stop_at_blank = !token->ignore_blank;
    }
  }

  if (in_unigram_run) {
    while ((char_len = grn_charlen_(ctx, (char *)r, (char *)e,
                                    token->encoding))) {
      len++;
      r += char_len;
      if (stop_at_blank && GRN_STR_ISBLANK(*ctype)) {
        break;
      }
      ctype++;
      if (GRN_STR_CTYPE(*ctype) != run_ctype) {
        break;
      }
    }
    token->next = r;
    token->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        token->status = grn_token_not_found;
        return NULL;
      }
      len = grn_str_len(key, token->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, token->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= token->tail) { continue; }
      p += strlen(key);
      if (!*p && !token->add) {
        token->status = grn_token_done;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    char_len = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding);
    if (char_len) {
      len++;
      r += char_len;
      /* The following token starts right after this first character. */
      token->next = r;
      while (len < token->ngram_unit) {
        char_len = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding);
        if (!char_len) {
          break;
        }
        if (ctype) {
          if (!token->ignore_blank && GRN_STR_ISBLANK(*ctype)) {
            break;
          }
          ctype++;
          /* Do not run into a character that starts a unigram group. */
          if ((token->uni_alpha && GRN_STR_CTYPE(*ctype) == grn_str_alpha) ||
              (token->uni_digit && GRN_STR_CTYPE(*ctype) == grn_str_digit) ||
              (token->uni_symbol && GRN_STR_CTYPE(*ctype) == grn_str_symbol)) {
            break;
          }
        }
        len++;
        r += char_len;
      }
      if (token->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < token->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      token->overlap = (len > 1) ? 1 : 0;
    }
  }

  token->pos = pos;
  token->len = len;
  token->tail = pos + len - 1;
  if (p != r && token->next != e) {
    token->skip = token->overlap ? 1 : len;
  } else {
    token->skip = 0;
    status |= GRN_TOKEN_LAST;
  }
  if (r == e) {
    status |= GRN_TOKEN_REACH_END;
  }

  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, status);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
/*
 * Allocates and initializes a token cursor over str for the given table.
 *
 * The table's flags, encoding, tokenizer and normalizer are looked up with
 * grn_table_get_info().  With a tokenizer, the input text and the flags
 * value are wrapped in temporary objects, handed to grn_ctx_push() (text
 * first), and the tokenizer's PROC_INIT function is invoked with the table
 * as its only argument.  Without a tokenizer, the input is opened through
 * grn_string_open_() with the table's normalizer and the cursor's
 * curr/curr_size are pointed at the normalized text; a failure there is
 * reported with GRN_TOKENIZER_ERROR.
 *
 * ctx:     context used for allocation and error reporting.
 * table:   table whose tokenizer/normalizer settings are used.
 * str:     input text.
 * str_len: length of str.
 * mode:    stored in the cursor's mode member.
 * flags:   value passed on to the tokenizer's init function.
 *
 * Returns the new cursor, or NULL when grn_table_get_info() reports a
 * failure, when allocation fails, or when ctx->rc is set after
 * initialization (the cursor is released with grn_token_close() in that
 * case).
 */
grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
               grn_token_mode mode, unsigned int flags)
{
  grn_token *cursor;
  grn_encoding table_encoding;
  grn_obj *table_tokenizer;
  grn_obj *table_normalizer;
  grn_obj_flags table_flags;

  if (grn_table_get_info(ctx, table, &table_flags, &table_encoding,
                         &table_tokenizer, &table_normalizer)) {
    return NULL;
  }
  cursor = GRN_MALLOC(sizeof(grn_token));
  if (!cursor) {
    return NULL;
  }

  cursor->table = table;
  cursor->mode = mode;
  cursor->encoding = table_encoding;
  cursor->tokenizer = table_tokenizer;
  cursor->orig = str;
  cursor->orig_blen = str_len;
  cursor->curr = NULL;
  cursor->nstr = NULL;
  cursor->curr_size = 0;
  cursor->pos = -1;
  cursor->status = GRN_TOKEN_DOING;
  cursor->force_prefix = 0;

  if (!table_tokenizer) {
    int normalize_flags = 0;

    cursor->nstr = grn_string_open_(ctx, str, str_len, table_normalizer,
                                    normalize_flags, cursor->encoding);
    if (!cursor->nstr) {
      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
    } else {
      const char *normalized;

      grn_string_get_normalized(ctx, cursor->nstr, &normalized,
                                &(cursor->curr_size), NULL);
      cursor->curr = (const unsigned char *)normalized;
    }
  } else {
    grn_proc *proc = (grn_proc *)table_tokenizer;
    grn_obj input;
    grn_obj input_flags;

    GRN_TEXT_INIT(&input, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&input, str, str_len);
    GRN_UINT32_INIT(&input_flags, 0);
    GRN_UINT32_SET(ctx, &input_flags, flags);

    cursor->pctx.caller = NULL;
    cursor->pctx.user_data.ptr = NULL;
    cursor->pctx.proc = proc;
    cursor->pctx.hooks = NULL;
    cursor->pctx.currh = NULL;
    cursor->pctx.phase = PROC_INIT;

    /* Text is pushed first, flags second, before the init call. */
    grn_ctx_push(ctx, &input);
    grn_ctx_push(ctx, &input_flags);
    proc->funcs[PROC_INIT](ctx, 1, &table, &cursor->pctx.user_data);
    grn_obj_close(ctx, &input_flags);
    grn_obj_close(ctx, &input);
  }

  if (ctx->rc) {
    grn_token_close(ctx, cursor);
    return NULL;
  }
  return cursor;
}