/*
 * Emits the next single-character token from the input.
 *
 * Each call pushes exactly one character (length decided by the query
 * encoding) and then advances the cursor (`next`/`rest`).  The pushed
 * token carries GRN_TOKENIZER_LAST when it is the final one: either
 * grn_plugin_charlen() reported length 0 (invalid byte sequence or
 * exhausted input) or this character consumes all remaining bytes.
 * Always returns NULL, as the tokenizer "next" protocol requires.
 */
static grn_obj *
sample_next(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
            grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;
  const char *current = tokenizer->next;
  int char_length = grn_plugin_charlen(ctx, current, tokenizer->rest, encoding);
  grn_bool is_last =
    (char_length == 0 || tokenizer->rest - char_length == 0);

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           current,
                           char_length,
                           is_last ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE);
  tokenizer->next += char_length;
  tokenizer->rest -= char_length;
  return NULL;
}
/*
 * Emits the next token from input split by an arbitrary byte-string
 * delimiter, merging runs of consecutive delimiters into one split.
 * Always returns NULL (tokenizer "next" protocol).
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;
  if (tokenizer->have_tokenized_delimiter) {
    /* Input already carries the special tokenized-delimiter marker:
       delegate the split to the generic helper and remember where the
       next token starts. */
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next = (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
      ctx,
      &(tokenizer->token),
      (const char *)tokenizer->next,
      rest_length,
      tokenizer->query->encoding);
  } else {
    size_t cl;                               /* byte length of current character */
    const unsigned char *p = tokenizer->next, *r;  /* p: token start, r: scan cursor */
    const unsigned char *e = tokenizer->end;
    grn_token_status status;
    for (r = p; r < e; r += cl) {
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        /* Invalid byte sequence: treat the remainder as consumed. */
        tokenizer->next = (unsigned char *)e;
        break;
      }
      {
        /* Consume a run of one or more consecutive delimiters so that
           empty tokens are not produced between adjacent delimiters.
           `tokenizer->next` is advanced past the whole run. */
        grn_bool found_delimiter = GRN_FALSE;
        const unsigned char *current_end = r;
        while (current_end + tokenizer->delimiter_len <= e &&
               !memcmp(current_end,
                       tokenizer->delimiter,
                       tokenizer->delimiter_len)) {
          current_end += tokenizer->delimiter_len;
          tokenizer->next = current_end;
          found_delimiter = GRN_TRUE;
        }
        if (found_delimiter) {
          break;
        }
      }
    }
    /* r == e means the scan consumed the rest of the input: last token. */
    if (r == e) {
      status = GRN_TOKEN_LAST;
    } else {
      status = GRN_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)p,
                             r - p,
                             status);
  }
  return NULL;
}
/* This function returns tokens one by one. */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;
  if (tokenizer->query->have_tokenized_delimiter) {
    /* Input already contains the tokenized-delimiter marker: delegate. */
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
  } else {
    /* NOTE(review): this path splits on whitespace — presumably
       `tokenizer->next` points into MeCab's space-separated parse
       output (wakati style); confirm against the _init function. */
    size_t cl;                      /* byte length of current character */
    const char *p = tokenizer->next, *r;  /* p: token start, r: scan cursor */
    const char *e = tokenizer->end;
    grn_tokenizer_status status;
    for (r = p; r < e; r += cl) {
      int space_len;
      space_len = grn_isspace(r, encoding);
      if (space_len > 0 && r == p) {
        /* Skip leading whitespace before the token begins. */
        cl = space_len;
        p = r + cl;
        continue;
      }
      if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
        /* Invalid byte sequence: consume the rest of the input. */
        tokenizer->next = e;
        break;
      }
      if (space_len > 0) {
        /* Token ends at whitespace; also swallow any following run of
           whitespace so the next call starts on a character. */
        const char *q = r + space_len;
        while (q < e && (space_len = grn_isspace(q, encoding))) {
          q += space_len;
        }
        tokenizer->next = q;
        break;
      }
    }
    /* Last token when the scan hit the end, or when the resume point
       is already at the end (e.g. trailing whitespace swallowed). */
    if (r == e || tokenizer->next == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }
  return NULL;
}
/*
 * Emits the next fixed-size element of an unsigned vector as a token.
 *
 * The vector is walked in steps of `unit` bytes from `curr` to `tail`.
 * When less than one full element remains, an empty GRN_TOKEN_LAST
 * token is pushed and the cursor is left untouched.  Always returns
 * NULL (tokenizer "next" protocol).
 */
static grn_obj *
uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_uvector_tokenizer *tokenizer = user_data->ptr;
  byte *element_end = tokenizer->curr + tokenizer->unit;

  if (tokenizer->tail < element_end) {
    /* No full element left: terminate with an empty token. */
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)tokenizer->curr,
                             0,
                             GRN_TOKEN_LAST);
    return NULL;
  }

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)tokenizer->curr,
                           tokenizer->unit,
                           (tokenizer->tail == element_end)
                             ? GRN_TOKEN_LAST
                             : GRN_TOKEN_CONTINUE);
  tokenizer->curr = element_end;
  return NULL;
}
/*
 * Emits the next token from input split by an arbitrary byte-string
 * delimiter.  Unlike the run-merging variant, this splits at the first
 * delimiter occurrence only, so adjacent delimiters yield empty tokens.
 * Always returns NULL (tokenizer "next" protocol).
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    /* Input already carries the tokenized-delimiter marker: delegate. */
    unsigned int rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             rest_length,
                                             tokenizer->query->encoding);
    return NULL;
  }

  {
    const unsigned char *token_start = tokenizer->next;
    const unsigned char *end = tokenizer->end;
    const unsigned char *scan = token_start;
    grn_tokenizer_status status;

    while (scan < end) {
      size_t char_length = grn_charlen_(ctx,
                                        (char *)scan,
                                        (char *)end,
                                        tokenizer->query->encoding);
      if (char_length == 0) {
        /* Invalid byte sequence: treat the remainder as consumed. */
        tokenizer->next = (unsigned char *)end;
        break;
      }
      if (scan + tokenizer->delimiter_len <= end &&
          memcmp(scan, tokenizer->delimiter, tokenizer->delimiter_len) == 0) {
        /* Token ends here; resume just after the delimiter next time. */
        tokenizer->next = scan + tokenizer->delimiter_len;
        break;
      }
      scan += char_length;
    }

    status = (scan == end) ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE;
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             token_start,
                             scan - token_start,
                             status);
  }
  return NULL;
}
/*
 * Pushes the next token from text that is pre-segmented with the
 * special "tokenized delimiter" character.
 *
 * Scans from str_ptr for the first tokenized-delimiter character; the
 * bytes before it form the token.  Returns a pointer just past the
 * delimiter (the start of the following token), or NULL when no
 * delimiter was found before the end of input or before an invalid
 * byte sequence.  The token is flagged GRN_TOKENIZER_LAST only when
 * the scan reached exactly the end of the input.
 */
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  const char *end = str_ptr + str_length;
  const char *current = str_ptr;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  while (current < end) {
    size_t char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      /* Invalid byte sequence: stop scanning here. */
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx,
                                             current,
                                             char_length,
                                             encoding)) {
      next_start = current + char_length;
      break;
    }
    current += char_length;
  }

  token_length = current - str_ptr;
  status = (current == end) ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE;
  grn_tokenizer_token_push(ctx, token, str_ptr, token_length, status);
  return next_start;
}
/*
 * Emits the next bigram token for regexp search.
 *
 * Indexing mode (ADD): emits a begin-mark token first, the end-mark
 * token last, and overlapping 2-character ngrams in between.
 * Query mode (GET): additionally honors `\`-escapes in the pattern and
 * skips redundant overlapping tokens (n_skip_tokens) so that only every
 * ngram_unit-th token is matched.  Always returns NULL.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;   /* fixed bigram size */
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool escaping = GRN_FALSE;
  GRN_BULK_REWIND(buffer);
  if (mode == GRN_TOKEN_GET) {
    /* Query side: emit pending begin/end markers before any ngram. */
    if (tokenizer->get.have_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      tokenizer->get.have_begin = GRN_FALSE;
      return NULL;
    }
    if (tokenizer->is_end && tokenizer->get.have_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  } else {
    /* Index side: always bracket the text with begin/end markers. */
    if (tokenizer->is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      tokenizer->is_begin = GRN_FALSE;
      return NULL;
    }
    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }
  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    /* Empty or invalid input: emit a final empty token. */
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }
  /* Collect up to ngram_unit characters into `buffer`. */
  while (GRN_TRUE) {
    if (!escaping && mode == GRN_TOKEN_GET &&
        char_len == 1 && current[0] == '\\') {
      /* Query-side escape: drop the backslash, take next char literally. */
      current += char_len;
      escaping = GRN_TRUE;
    } else {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      escaping = GRN_FALSE;
      if (n_characters == 1) {
        /* Next call resumes one character after this token's start,
           producing overlapping ngrams. */
        tokenizer->next = current;
      }
      if (n_characters == ngram_unit) {
        break;
      }
    }
    char_len = grn_charlen_(ctx, (const char *)current,
                            (const char *)end, tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }
  }
  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);
  if (mode == GRN_TOKEN_GET) {
    if ((end - tokenizer->next) < ngram_unit) {
      /* Fewer than ngram_unit characters remain after the resume point. */
      if (tokenizer->get.have_end) {
        if (tokenizer->next == end) {
          tokenizer->is_end = GRN_TRUE;
        }
        if (status & GRN_TOKEN_UNMATURED) {
          if (tokenizer->is_first_token) {
            status |= GRN_TOKEN_FORCE_PREFIX;
          } else {
            status |= GRN_TOKEN_SKIP;
          }
        }
      } else {
        tokenizer->is_end = GRN_TRUE;
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
        if (status & GRN_TOKEN_UNMATURED) {
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    } else {
      /* Skip ngram_unit - 1 overlapping tokens between matches. */
      if (tokenizer->get.n_skip_tokens > 0) {
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
  }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);
  tokenizer->is_first_token = GRN_FALSE;
  return NULL;
}
/*
 * Emits the next ngram token.
 *
 * When character-type data (`ctypes`) is available and the matching
 * uni_alpha/uni_digit/uni_symbol option is set, runs of alphabetic,
 * numeric, or symbol characters are grouped into a single token
 * instead of being split into ngrams.  Otherwise up to `ngram_unit`
 * characters are collected, stopping early at blanks or at a
 * character-type change into one of the grouped classes.  Overlap and
 * skip bookkeeping (`overlap`, `skip`, `pos`, `tail`) drives the
 * overlapping-ngram progression across calls.  Always returns NULL.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;                      /* byte length of current character */
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  /* cp walks the per-character type array in step with r. */
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
    /* Group a whole run of alphabetic characters into one token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) {
        break;
      }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) {
        break;
      }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp && tokenizer->uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
    /* Group a whole run of digits into one token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) {
        break;
      }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) {
        break;
      }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp && tokenizer->uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
    /* Group a whole run of symbol characters into one token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) {
        break;
      }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) {
        break;
      }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= tokenizer->tail) {
        continue;
      }
      p += strlen(key);
      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    /* Plain ngram path: collect up to ngram_unit characters. */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                           tokenizer->query->encoding))) {
      len++;
      r += cl;
      tokenizer->next = r;
      while (len < tokenizer->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                tokenizer->query->encoding))) {
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) {
            break;
          }
          cp++;
          /* Stop before a character that belongs to a grouped class;
             it will start its own grouped token on the next call. */
          if ((tokenizer->uni_alpha &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }
  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;
  if (p == r || tokenizer->next == e) {
    /* Nothing consumed, or resume point already at the end: last token. */
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    /* Overlapping ngrams advance by 1 character; grouped tokens by len. */
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (r == e) {
    status |= GRN_TOKEN_REACH_END;
  }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)p,
                           r - p,
                           status);
  return NULL;
}
static grn_obj * yangram_next(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args, grn_user_data *user_data) { grn_yangram_tokenizer *tokenizer = user_data->ptr; const unsigned char *string_end = tokenizer->end; const unsigned char *token_top = tokenizer->next; const unsigned char *token_next = token_top; const unsigned char *token_tail = token_top; int token_size = 0; grn_bool is_token_grouped = GRN_FALSE; const unsigned char *token_ctypes = NULL; unsigned int ctypes_skip_size; int char_length = 0; grn_tokenizer_status status = 0; grn_bool is_token_hit = GRN_FALSE; grn_obj *lexicon = args[0]; if (tokenizer->phrase_table) { if (tokenizer->nhits > 0 && token_top - (const unsigned char *)tokenizer->scan_start > tokenizer->hits[tokenizer->current_hit].offset) { tokenizer->current_hit++; } if (tokenizer->current_hit >= tokenizer->nhits) { tokenizer->scan_start = tokenizer->scan_rest; unsigned int scan_rest_length = tokenizer->end - (const unsigned char *)tokenizer->scan_rest; if (scan_rest_length > 0) { tokenizer->nhits = grn_pat_scan(ctx, (grn_pat *)tokenizer->phrase_table, tokenizer->scan_rest, scan_rest_length, tokenizer->hits, MAX_N_HITS, &(tokenizer->scan_rest)); tokenizer->current_hit = 0; } } if (tokenizer->nhits > 0 && tokenizer->current_hit < tokenizer->nhits && token_top - (const unsigned char *)tokenizer->scan_start == tokenizer->hits[tokenizer->current_hit].offset) { is_token_hit = GRN_TRUE; } } if (tokenizer->ctypes) { token_ctypes = tokenizer->ctypes + tokenizer->ctypes_next; } else { token_ctypes = NULL; } if (is_token_hit) { token_size = forward_scan_hit_token_tail(ctx, tokenizer, &token_tail, tokenizer->hits[tokenizer->current_hit].length); token_next = token_tail; tokenizer->current_hit++; } else { is_token_grouped = is_token_group(tokenizer, token_ctypes); if (is_token_grouped) { token_size = forward_grouped_token_tail(ctx, tokenizer, token_ctypes, &token_tail); token_next = token_tail; } else { token_size = forward_ngram_token_tail(ctx, 
tokenizer, token_ctypes, &token_tail); char_length = grn_plugin_charlen(ctx, (char *)token_next, tokenizer->rest_length, tokenizer->query->encoding); token_next += char_length; } } if (token_top == token_tail || token_next == string_end) { ctypes_skip_size = 0; } else { if (is_token_grouped || is_token_hit) { ctypes_skip_size = token_size; } else { ctypes_skip_size = 1; } } if (tokenizer->use_vgram > 0 && !is_token_grouped) { grn_bool maybe_vgram = GRN_FALSE; grn_id id; id = grn_table_get(ctx, tokenizer->vgram_table, (const char *)token_top, token_tail - token_top); if (id) { maybe_vgram = GRN_TRUE; } if (tokenizer->use_vgram >= VGRAM_BOTH && !maybe_vgram) { if (token_tail < string_end && !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) { grn_id id; const unsigned char *token_next_tail; char_length = grn_plugin_charlen(ctx, (char *)token_tail, tokenizer->rest_length, tokenizer->query->encoding); token_next_tail = token_tail + char_length; id = grn_table_get(ctx, tokenizer->vgram_table, (const char *)token_next, token_next_tail - token_next); if (id) { maybe_vgram = GRN_TRUE; } } else if (token_tail == string_end && tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) { maybe_vgram = GRN_TRUE; } } if (maybe_vgram) { if (token_tail < string_end && !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) { char_length = grn_plugin_charlen(ctx, (char *)token_tail, tokenizer->rest_length, tokenizer->query->encoding); token_size++; token_tail += char_length; if (tokenizer->use_vgram == VGRAM_QUAD) { if (token_tail < string_end && !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) { id = grn_table_get(ctx, tokenizer->vgram_table, (const char *)token_top, token_tail - token_top); if (id) { char_length = grn_plugin_charlen(ctx, (char *)token_tail, tokenizer->rest_length, tokenizer->query->encoding); token_size++; token_tail += char_length; } } else { if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) { grn_id 
tid; tid = grn_table_get(ctx, lexicon, (const char *)token_top, token_tail - token_top); if (tid == GRN_ID_NIL) { int added; grn_table_add(ctx, lexicon, (const char *)token_top, token_tail - token_top, &added); } status |= GRN_TOKEN_FORCE_PREFIX; } } } } else { if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) { grn_id tid; tid = grn_table_get(ctx, lexicon, (const char *)token_top, token_tail - token_top); if (tid == GRN_ID_NIL) { int added; grn_table_add(ctx, lexicon, (const char *)token_top, token_tail - token_top, &added); } status |= GRN_TOKEN_FORCE_PREFIX; } } } } if (token_top == token_tail || token_next == string_end) { status |= GRN_TOKEN_LAST; } if (token_tail == string_end) { status |= GRN_TOKEN_REACH_END; } if (!is_token_grouped && !is_token_hit && token_size < tokenizer->ngram_unit) { status |= GRN_TOKEN_UNMATURED; } if (tokenizer->pushed_token_tail && token_top < tokenizer->pushed_token_tail) { status |= GRN_TOKEN_OVERLAP; if (tokenizer->skip_overlap && !grn_ii_overlap_token_skip_enable && !(status & GRN_TOKEN_REACH_END) && !(status & GRN_TOKEN_SKIP_WITH_POSITION) && tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) { if (token_tail <= tokenizer->pushed_token_tail) { status |= GRN_TOKEN_SKIP; } else { if (!is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) { status |= GRN_TOKEN_SKIP; } } } } if (!(status & GRN_TOKEN_SKIP) && !(status & GRN_TOKEN_SKIP_WITH_POSITION)) { tokenizer->pushed_token_tail = token_tail; } tokenizer->next = token_next; tokenizer->rest_length = string_end - token_next; tokenizer->ctypes_next = tokenizer->ctypes_next + ctypes_skip_size; grn_tokenizer_token_push(ctx, &(tokenizer->token), (const char *)token_top, token_tail - token_top, status); return NULL; }
/*
 * Emits the next bigram token for regexp search, using per-character
 * type data to break tokens at blanks.
 *
 * Non-GET (indexing) mode brackets the text with begin/end marker
 * tokens and skips an empty placeholder token right after a blank.
 * GET (query) mode recognizes literal begin/end markers embedded in
 * the query text and applies overlapping-token skipping
 * (n_skip_tokens) between blanks.  Always returns NULL.
 *
 * Fix: `char_types` was declared `const const uint_least8_t *` — a
 * duplicated qualifier (harmless in C99+ but clearly a typo); reduced
 * to a single `const`.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;   /* fixed bigram size */
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool is_begin = tokenizer->is_begin;
  grn_bool is_start_token = tokenizer->is_start_token;
  grn_bool break_by_blank = GRN_FALSE;
  grn_bool break_by_end_mark = GRN_FALSE;
  GRN_BULK_REWIND(buffer);
  tokenizer->is_begin = GRN_FALSE;
  tokenizer->is_start_token = GRN_FALSE;
  if (char_types) {
    /* Position the type cursor at the current character. */
    char_types += tokenizer->nth_char;
  }
  if (mode != GRN_TOKEN_GET) {
    /* Index side: begin/end markers and post-blank placeholder. */
    if (is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (is_start_token) {
      if (char_types && GRN_STR_ISBLANK(char_types[-1])) {
        /* Token start right after a blank: emit a skipped empty token
           so positions stay aligned. */
        status |= GRN_TOKEN_SKIP;
        grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
        return NULL;
      }
    }
  }
  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    /* Empty or invalid input: emit a final empty token. */
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }
  if (mode == GRN_TOKEN_GET) {
    /* Query side: recognize literal begin/end markers in the input. */
    if (is_begin &&
        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      tokenizer->next = current;
      tokenizer->nth_char++;
      if (current == end) {
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      }
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }
  /* Collect up to ngram_unit characters, breaking at blanks and at a
     trailing end marker. */
  while (GRN_TRUE) {
    n_characters++;
    GRN_TEXT_PUT(ctx, buffer, current, char_len);
    current += char_len;
    if (n_characters == 1) {
      /* Next call resumes one character later (overlapping ngrams). */
      tokenizer->next = current;
      tokenizer->nth_char++;
    }
    if (char_types) {
      uint_least8_t char_type;
      char_type = char_types[0];
      char_types++;
      if (GRN_STR_ISBLANK(char_type)) {
        break_by_blank = GRN_TRUE;
      }
    }
    char_len = grn_charlen_(ctx, (const char *)current,
                            (const char *)end,
                            tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }
    if (mode == GRN_TOKEN_GET &&
        current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      break_by_end_mark = GRN_TRUE;
    }
    if (break_by_blank || break_by_end_mark) {
      break;
    }
    if (n_characters == ngram_unit) {
      break;
    }
  }
  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);
  if (mode == GRN_TOKEN_GET) {
    if (current == end) {
      tokenizer->is_end = GRN_TRUE;
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      if (status & GRN_TOKEN_UNMATURED) {
        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        /* Blank resets the skip counter; next token starts a new run. */
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
      } else if (break_by_end_mark) {
        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
          status |= GRN_TOKEN_SKIP;
        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        /* Skip ngram_unit - 1 overlapping tokens between matches. */
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
    if (break_by_blank) {
      tokenizer->is_start_token = GRN_TRUE;
    }
  }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);
  return NULL;
}