/*
 * Reports whether the given string contains at least one tokenized-delimiter
 * character. Only UTF-8 input is scanned; any other encoding, and an empty
 * string, immediately yield GRN_FALSE. Scanning stops early on an invalid
 * byte sequence (grn_charlen_() returning 0).
 */
grn_bool
grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  const char *p;
  const char *p_end = str_ptr + str_length;

  if (encoding != GRN_ENC_UTF8 || str_length == 0) {
    return GRN_FALSE;
  }
  for (p = str_ptr; p < p_end; ) {
    int len = grn_charlen_(ctx, p, p_end, encoding);
    if (len <= 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, p, len, encoding)) {
      return GRN_TRUE;
    }
    p += len;
  }
  return GRN_FALSE;
}
/*
 * Tokenizer "next" callback (old grn_rc interface): emits the next
 * whitespace-delimited token starting at token->next and pushes the token
 * text and its status onto the ctx stack.
 *
 * Fix: the whitespace-skip loop previously had no bound check, so trailing
 * whitespace at the very end of the buffer made grn_isspace() read past
 * token->end. The `q < e` guard (also present in the newer variant of this
 * function) prevents the out-of-bounds read.
 */
static grn_rc
mecab_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  size_t cl;
  grn_mecab_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;
  /* Scan forward one character at a time until whitespace or end of input. */
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      /* Invalid byte sequence: treat the remainder as consumed. */
      token->next = (unsigned char *)e;
      break;
    }
    if (grn_isspace((const char *)r, token->encoding)) {
      const unsigned char *q = r;
      /* Skip the whole run of whitespace, but never read past e. */
      while (q < e && (cl = grn_isspace((const char *)q, token->encoding))) {
        q += cl;
      }
      token->next = (unsigned char *)q;
      break;
    }
  }
  /* Token is [p, r); r == e means this was the last token. */
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
/*
 * Fills `token` with the next chunk of str_ptr, up to (but not including)
 * the first tokenized-delimiter character. Returns the position just past
 * that delimiter so the caller can continue from there, or NULL when no
 * delimiter was found in the remaining input. The token status is
 * GRN_TOKEN_LAST when the scan consumed the whole string, otherwise
 * GRN_TOKEN_CONTINUE.
 */
const char *
grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
                                          grn_token *token,
                                          const char *str_ptr,
                                          unsigned int str_length,
                                          grn_encoding encoding)
{
  const char *p = str_ptr;
  const char *str_end = str_ptr + str_length;
  const char *next_start = NULL;

  while (p < str_end) {
    size_t len = grn_charlen_(ctx, p, str_end, encoding);
    if (len == 0) {
      /* Invalid byte sequence: stop here and emit what we have. */
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, p, len, encoding)) {
      next_start = p + len;
      break;
    }
    p += len;
  }
  grn_token_set_data(ctx, token, str_ptr, p - str_ptr);
  grn_token_set_status(ctx, token,
                       (p == str_end) ? GRN_TOKEN_LAST : GRN_TOKEN_CONTINUE);
  return next_start;
}
/*
 * This function returns tokens one by one: the next whitespace-delimited
 * token starting at token->next is pushed (with its status) onto the ctx
 * stack.
 *
 * Fix: the whitespace-skip loop previously had no bound check, so trailing
 * whitespace at the very end of the buffer made grn_isspace() read past
 * token->end. The `q < e` guard (used in the newer variant of this
 * function) prevents the out-of-bounds read.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *token = user_data->ptr;
  char *p = token->next, *r;
  char *e = token->end;
  /* Scan forward one character at a time until whitespace or end of input. */
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, r, e, token->encoding))) {
      /* Invalid byte sequence: treat the remainder as consumed. */
      token->next = e;
      break;
    }
    if (grn_isspace(r, token->encoding)) {
      char *q = r;
      /* Skip the whole run of whitespace, but never read past e. */
      while (q < e && (cl = grn_isspace(q, token->encoding))) {
        q += cl;
      }
      token->next = q;
      break;
    }
  }
  /* Token is [p, r); r == e means this was the last token. */
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
/*
 * Delimiter tokenizer callback: emits the next token from the current
 * scanning position (tokenizer->next).
 *
 * When the query was pre-tokenized with delimiter characters, the work is
 * delegated to grn_tokenizer_tokenized_delimiter_next(). Otherwise the
 * input is scanned character by character; a run of one or more
 * consecutive delimiter byte sequences ends the token, and the scan
 * position is advanced past the whole run so that empty tokens are not
 * produced between adjacent delimiters.
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;
  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
        ctx,
        &(tokenizer->token),
        (const char *)tokenizer->next,
        rest_length,
        tokenizer->query->encoding);
  } else {
    size_t cl;
    const unsigned char *p = tokenizer->next, *r;
    const unsigned char *e = tokenizer->end;
    grn_token_status status;
    for (r = p; r < e; r += cl) {
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        /* Invalid byte sequence: treat the remainder as consumed. */
        tokenizer->next = (unsigned char *)e;
        break;
      }
      {
        grn_bool found_delimiter = GRN_FALSE;
        const unsigned char *current_end = r;
        /* Consume every consecutive delimiter occurrence so the next
           call resumes after the whole run. */
        while (current_end + tokenizer->delimiter_len <= e &&
               !memcmp(current_end,
                       tokenizer->delimiter,
                       tokenizer->delimiter_len)) {
          current_end += tokenizer->delimiter_len;
          tokenizer->next = current_end;
          found_delimiter = GRN_TRUE;
        }
        if (found_delimiter) {
          break;
        }
      }
    }
    /* r == e: scan reached the end of input, so this token is the last. */
    if (r == e) {
      status = GRN_TOKEN_LAST;
    } else {
      status = GRN_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)p,
                             r - p,
                             status);
  }
  return NULL;
}
/* This function returns tokens one by one. */
/*
 * Tokenizer "next" callback: emits the next whitespace-delimited token.
 * With a pre-tokenized query the work is delegated to
 * grn_tokenizer_tokenized_delimiter_next(). Otherwise the scan skips
 * leading whitespace (moving the token start p forward), collects
 * characters until the next whitespace run, then skips that run so the
 * following call starts at a non-space character.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;
  if (tokenizer->query->have_tokenized_delimiter) {
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
  } else {
    size_t cl;
    const char *p = tokenizer->next, *r;
    const char *e = tokenizer->end;
    grn_tokenizer_status status;
    for (r = p; r < e; r += cl) {
      int space_len;
      /* grn_isspace() returns the byte length of the space character,
         or 0 when r is not a space (inferred from how cl/q advance). */
      space_len = grn_isspace(r, encoding);
      if (space_len > 0 && r == p) {
        /* Leading whitespace: advance the token start past it. */
        cl = space_len;
        p = r + cl;
        continue;
      }
      if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
        /* Invalid byte sequence: treat the remainder as consumed. */
        tokenizer->next = e;
        break;
      }
      if (space_len > 0) {
        /* Token ended; skip the whole whitespace run (bounded by e). */
        const char *q = r + space_len;
        while (q < e && (space_len = grn_isspace(q, encoding))) {
          q += space_len;
        }
        tokenizer->next = q;
        break;
      }
    }
    /* Last token when the scan or the saved resume position hit the end. */
    if (r == e || tokenizer->next == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }
  return NULL;
}
/*
 * Delimiter tokenizer callback: emits the next token from the scanning
 * position tokenizer->next. With a pre-tokenized query the work is
 * delegated to grn_tokenizer_tokenized_delimiter_next(); otherwise the
 * input is scanned until the configured delimiter byte sequence (or the
 * end of the input) is reached, and the resume position is placed just
 * past the delimiter.
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             rest_length,
                                             tokenizer->query->encoding);
  } else {
    const unsigned char *token_start = tokenizer->next;
    const unsigned char *scan = token_start;
    const unsigned char *input_end = tokenizer->end;
    grn_tokenizer_status status;

    while (scan < input_end) {
      size_t char_len = grn_charlen_(ctx, (char *)scan, (char *)input_end,
                                     tokenizer->query->encoding);
      if (!char_len) {
        /* Invalid byte sequence: consume the remainder. */
        tokenizer->next = (unsigned char *)input_end;
        break;
      }
      if (scan + tokenizer->delimiter_len <= input_end &&
          !memcmp(scan, tokenizer->delimiter, tokenizer->delimiter_len)) {
        /* Resume after the delimiter on the next call. */
        tokenizer->next = scan + tokenizer->delimiter_len;
        break;
      }
      scan += char_len;
    }
    status = (scan == input_end) ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE;
    grn_tokenizer_token_push(ctx, &(tokenizer->token),
                             token_start, scan - token_start, status);
  }
  return NULL;
}
/*
 * Pushes the next chunk of str_ptr (up to, but not including, the first
 * tokenized-delimiter character) as a token, and returns the position just
 * past that delimiter for the caller to resume from — or NULL when the
 * remaining input contains no delimiter. The pushed status is
 * GRN_TOKENIZER_LAST when the whole string was consumed, otherwise
 * GRN_TOKENIZER_CONTINUE.
 */
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  const char *p = str_ptr;
  const char *str_end = str_ptr + str_length;
  const char *next_start = NULL;
  grn_token_status status;

  while (p < str_end) {
    size_t char_length = grn_charlen_(ctx, p, str_end, encoding);
    if (char_length == 0) {
      /* Invalid byte sequence: stop scanning and emit what we have. */
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, p, char_length, encoding)) {
      next_start = p + char_length;
      break;
    }
    p += char_length;
  }
  status = (p == str_end) ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE;
  grn_tokenizer_token_push(ctx, token, str_ptr, p - str_ptr, status);
  return next_start;
}
/*
 * Delimiter tokenizer callback (old push-style interface): scans from
 * token->next for the configured delimiter byte sequence and pushes the
 * token text [p, r) and its status onto the ctx stack. The resume
 * position is placed just past the delimiter.
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_delimited_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      /* Invalid byte sequence: treat the remainder as consumed. */
      token->next = (unsigned char *)e;
      break;
    }
    if (r + token->delimiter_len <= e &&
        !memcmp(r, token->delimiter, token->delimiter_len)) {
      /* Delimiter found: resume after it on the next call. */
      token->next = r + token->delimiter_len;
      break;
    }
  }
  /* r == e means the scan hit the end of input: last token. */
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
/*
 * Regexp tokenizer callback: emits bigrams (ngram_unit == 2) plus synthetic
 * begin/end mark tokens. In GRN_TOKEN_GET mode a backslash in the query
 * escapes the following character (the backslash itself is not emitted),
 * and \A / \z were translated by regexp_init() into have_begin / have_end
 * flags handled here. Status flags (OVERLAP, UNMATURED, SKIP, FORCE_PREFIX,
 * LAST, REACH_END) are accumulated and pushed with the token.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool escaping = GRN_FALSE;
  GRN_BULK_REWIND(buffer);
  if (mode == GRN_TOKEN_GET) {
    /* Emit the begin mark first if the query started with \A. */
    if (tokenizer->get.have_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      tokenizer->get.have_begin = GRN_FALSE;
      return NULL;
    }
    /* Emit the end mark last if the query ended with \z. */
    if (tokenizer->is_end && tokenizer->get.have_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  } else {
    /* ADD mode: every document is wrapped in begin/end marks. */
    if (tokenizer->is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      tokenizer->is_begin = GRN_FALSE;
      return NULL;
    }
    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }
  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    /* Nothing left (or invalid byte sequence): emit an empty last token. */
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }
  /* Collect up to ngram_unit characters into buffer. */
  while (GRN_TRUE) {
    if (!escaping && mode == GRN_TOKEN_GET &&
        char_len == 1 && current[0] == '\\') {
      /* Backslash escapes the next character; it is skipped, not emitted. */
      current += char_len;
      escaping = GRN_TRUE;
    } else {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      escaping = GRN_FALSE;
      if (n_characters == 1) {
        /* The next call resumes one character after this token's start
           (overlapping bigrams). */
        tokenizer->next = current;
      }
      if (n_characters == ngram_unit) {
        break;
      }
    }
    char_len = grn_charlen_(ctx, (const char *)current,
                            (const char *)end, tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }
  }
  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);
  if (mode == GRN_TOKEN_GET) {
    if ((end - tokenizer->next) < ngram_unit) {
      /* Fewer than ngram_unit characters remain after the resume point. */
      if (tokenizer->get.have_end) {
        if (tokenizer->next == end) {
          tokenizer->is_end = GRN_TRUE;
        }
        if (status & GRN_TOKEN_UNMATURED) {
          if (tokenizer->is_first_token) {
            status |= GRN_TOKEN_FORCE_PREFIX;
          } else {
            status |= GRN_TOKEN_SKIP;
          }
        }
      } else {
        tokenizer->is_end = GRN_TRUE;
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
        if (status & GRN_TOKEN_UNMATURED) {
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    } else {
      /* Skip overlapping tokens between full ngram boundaries. */
      if (tokenizer->get.n_skip_tokens > 0) {
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
  }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);
  tokenizer->is_first_token = GRN_FALSE;
  return NULL;
}
/*
 * Regexp tokenizer "init" callback: opens the tokenizer query, allocates
 * and initializes the per-query tokenizer state, and points next/end at
 * the normalized query text. In GRN_TOKEN_GET mode, a leading "\A" and/or
 * trailing "\z" in the raw query set have_begin/have_end and are stripped
 * from the normalized range by advancing `next` / pulling back `end`.
 */
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = 0;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;
  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }
  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;
  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;
  tokenizer->get.have_begin = GRN_FALSE;
  tokenizer->get.have_end = GRN_FALSE;
  tokenizer->get.n_skip_tokens = 0;
  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end = GRN_FALSE;
  tokenizer->is_first_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
    unsigned int query_length = tokenizer->query->length;
    if (query_length >= 2) {
      const char *query_string = tokenizer->query->ptr;
      grn_encoding encoding = tokenizer->query->encoding;
      /* "\A..." anchors to the beginning: remember it and skip the two
         normalized characters that came from "\" and "A". */
      if (query_string[0] == '\\' && query_string[1] == 'A') {
        tokenizer->get.have_begin = GRN_TRUE;
        /* TODO: It assumes that both "\\" and "A" are normalized to
           1 characters. Normalizer may omit character or expand to
           multiple characters. */
        tokenizer->next += grn_charlen_(ctx, tokenizer->next,
                                        tokenizer->end, encoding);
        tokenizer->next += grn_charlen_(ctx, tokenizer->next,
                                        tokenizer->end, encoding);
      }
      /* "...\z" anchors to the end: remember it and drop the two
         normalized characters that came from "\" and "z". */
      if (query_string[query_length - 2] == '\\' &&
          query_string[query_length - 1] == 'z') {
        tokenizer->get.have_end = GRN_TRUE;
        /* TODO: It assumes that both "\\" and "z" are normalized to
           1 byte characters. Normalizer may omit character or expand
           to multiple characters. */
        tokenizer->end -= grn_charlen_(ctx, tokenizer->end - 1,
                                       tokenizer->end, encoding);
        tokenizer->end -= grn_charlen_(ctx, tokenizer->end - 1,
                                       tokenizer->end, encoding);
      }
    }
  }
  GRN_TEXT_INIT(&(tokenizer->buffer), 0);
  return NULL;
}
/*
 * N-gram tokenizer callback. Depending on the character type at the
 * current position (from the normalizer's ctype array `cp`), either a
 * whole run of alphabetic / digit / symbol characters is emitted as one
 * non-overlapping token (when the corresponding uni_* option is set), or
 * up to ngram_unit characters are collected as an overlapping n-gram.
 * Positional state (pos/len/tail/skip) is updated for the next call.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  /* cp walks the character-type array in lockstep with r. */
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
    /* Group a run of alphabetic characters into a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp && tokenizer->uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
    /* Group a run of digit characters into a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp && tokenizer->uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
    /* Group a run of symbol characters into a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
      p += strlen(key);
      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    /* Default path: collect up to ngram_unit characters, stopping early
       at blanks or at a boundary into a grouped character class. */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                           tokenizer->query->encoding))) {
      len++;
      r += cl;
      /* Resume one character after the token start (overlapping). */
      tokenizer->next = r;
      while (len < tokenizer->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                tokenizer->query->encoding))) {
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          if ((tokenizer->uni_alpha &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (tokenizer->overlap) { status |= GRN_TOKEN_OVERLAP; }
      if (len < tokenizer->ngram_unit) { status |= GRN_TOKEN_UNMATURED; }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }
  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;
  if (p == r || tokenizer->next == e) {
    /* Empty token or resume position at end: this is the last token. */
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (r == e) { status |= GRN_TOKEN_REACH_END; }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)p,
                           r - p,
                           status);
  return NULL;
}
/*
 * N-gram tokenizer callback (old grn_rc interface). Same scheme as the
 * newer variant: runs of alphabetic / digit / symbol characters are
 * grouped into single tokens when the corresponding uni_* option is set;
 * otherwise up to ngram_unit characters are collected as an overlapping
 * n-gram. The token text and status are pushed onto the ctx stack.
 */
static grn_rc
ngram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r = p, *e = token->end;
  int32_t len = 0, pos = token->pos + token->skip, status = 0;
  /* cp walks the character-type array in lockstep with r. */
  uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
  if (cp && token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) {
    /* Group a run of alphabetic characters into a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      if (GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != grn_str_alpha) { break; }
    }
    token->next = r;
    token->overlap = 0;
  } else if (cp && token->uni_digit && GRN_STR_CTYPE(*cp) == grn_str_digit) {
    /* Group a run of digit characters into a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      if (GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != grn_str_digit) { break; }
    }
    token->next = r;
    token->overlap = 0;
  } else if (cp && token->uni_symbol && GRN_STR_CTYPE(*cp) == grn_str_symbol) {
    /* Group a run of symbol characters into a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      if (GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != grn_str_symbol) { break; }
    }
    token->next = r;
    token->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        token->status = grn_token_not_found;
        return GRN_ID_NIL;
      }
      len = grn_str_len(key, token->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, token->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= token->tail) { continue; }
      p += strlen(key);
      if (!*p && !token->add) { token->status = grn_token_done; }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    /* Default path: collect up to ngram_unit characters, stopping early
       at blanks or at a boundary into a grouped character class. */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      len++;
      r += cl;
      /* Resume one character after the token start (overlapping). */
      token->next = r;
      while (len < token->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
        if (cp) {
          if (GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          if ((token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) ||
              (token->uni_digit && GRN_STR_CTYPE(*cp) == grn_str_digit) ||
              (token->uni_symbol && GRN_STR_CTYPE(*cp) == grn_str_symbol)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (token->overlap) { status |= GRN_TOKEN_OVERLAP; }
      if (len < token->ngram_unit) { status |= GRN_TOKEN_UNMATURED; }
      /* NOTE(review): this version sets overlap unconditionally; the newer
         variant uses (len > 1) — behavior difference is historical. */
      token->overlap = 1;
    }
  }
  token->pos = pos;
  token->len = len;
  token->tail = pos + len - 1;
  if (p == r || r == e) {
    /* Empty token or scan reached the end: this is the last token. */
    token->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    token->skip = token->overlap ? 1 : len;
  }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, status);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
/*
 * Feeds a (possibly large) UTF-8 string to MeCab in bounded chunks.
 * Strings below grn_mecab_chunk_size_threshold are tokenized in one call;
 * larger strings are split at whitespace, or — when a chunk grows past the
 * threshold — at the most recent delimiter character seen (falling back to
 * a hard split at the current position). Returns GRN_FALSE on the first
 * failed chunk or on an invalid byte sequence.
 */
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  /* Small input: no chunking needed. */
  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;
    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      /* Whitespace always ends the current chunk (if non-empty). */
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    current_character = current;
    current += character_bytes;
    if (is_delimiter_character(ctx, current_character, character_bytes)) {
      /* Remember the position just after the delimiter as a good split
         point if the chunk outgrows the threshold. */
      last_delimiter = current;
    }

    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        /* Prefer splitting at the last delimiter seen. */
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        /* No delimiter in this chunk: hard split at current position. */
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  /* Flush the final (possibly empty) chunk. */
  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}
/*
 * Regexp tokenizer callback (begin/end-mark-in-index variant): emits
 * bigrams (ngram_unit == 2) plus the begin/end mark tokens that were
 * written into the index, using the normalizer's character-type array to
 * break tokens at blanks. Status flags (OVERLAP, UNMATURED, SKIP,
 * FORCE_PREFIX, LAST, REACH_END) are accumulated and pushed with each
 * token.
 *
 * Fix: `const const uint_least8_t *char_types` had a duplicated `const`
 * qualifier — a constraint violation before C11 and a lint warning after.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool is_begin = tokenizer->is_begin;
  grn_bool is_start_token = tokenizer->is_start_token;
  grn_bool break_by_blank = GRN_FALSE;
  grn_bool break_by_end_mark = GRN_FALSE;
  GRN_BULK_REWIND(buffer);
  tokenizer->is_begin = GRN_FALSE;
  tokenizer->is_start_token = GRN_FALSE;
  if (char_types) {
    /* Align the ctype pointer with the current character position. */
    char_types += tokenizer->nth_char;
  }
  if (mode != GRN_TOKEN_GET) {
    /* ADD mode: wrap the document in synthetic begin/end mark tokens. */
    if (is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (is_start_token) {
      /* A token that starts right after a blank is emitted as an empty
         skip token so positions stay aligned. */
      if (char_types && GRN_STR_ISBLANK(char_types[-1])) {
        status |= GRN_TOKEN_SKIP;
        grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
        return NULL;
      }
    }
  }
  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    /* Nothing left (or invalid byte sequence): emit an empty last token. */
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }
  if (mode == GRN_TOKEN_GET) {
    /* The query itself may carry literal begin/end marks. */
    if (is_begin &&
        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      tokenizer->next = current;
      tokenizer->nth_char++;
      if (current == end) {
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      }
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }
  /* Collect up to ngram_unit characters, breaking early at a blank or
     (GET mode) just before a trailing end mark. */
  while (GRN_TRUE) {
    n_characters++;
    GRN_TEXT_PUT(ctx, buffer, current, char_len);
    current += char_len;
    if (n_characters == 1) {
      /* The next call resumes one character after this token's start
         (overlapping bigrams). */
      tokenizer->next = current;
      tokenizer->nth_char++;
    }
    if (char_types) {
      uint_least8_t char_type;
      char_type = char_types[0];
      char_types++;
      if (GRN_STR_ISBLANK(char_type)) {
        break_by_blank = GRN_TRUE;
      }
    }
    char_len = grn_charlen_(ctx, (const char *)current,
                            (const char *)end,
                            tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }
    if (mode == GRN_TOKEN_GET &&
        current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      break_by_end_mark = GRN_TRUE;
    }
    if (break_by_blank || break_by_end_mark) {
      break;
    }
    if (n_characters == ngram_unit) {
      break;
    }
  }
  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);
  if (mode == GRN_TOKEN_GET) {
    if (current == end) {
      tokenizer->is_end = GRN_TRUE;
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      if (status & GRN_TOKEN_UNMATURED) {
        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
      } else if (break_by_end_mark) {
        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
          status |= GRN_TOKEN_SKIP;
        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        /* Skip overlapping tokens between full ngram boundaries. */
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
    if (break_by_blank) {
      tokenizer->is_start_token = GRN_TRUE;
    }
  }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);
  return NULL;
}