/*
 * Advance *token_tail over a run of characters that share the character
 * class of the first one, when grouping is enabled for that class.
 *
 * Grouping applies when `ctypes` is non-NULL and the first character is
 *   - GRN_CHAR_ALPHA  and tokenizer->split_alpha  == GRN_FALSE, or
 *   - GRN_CHAR_DIGIT  and tokenizer->split_digit  == GRN_FALSE, or
 *   - GRN_CHAR_SYMBOL and tokenizer->split_symbol == GRN_FALSE.
 * In every other case nothing is consumed.
 *
 * ctx        - passed through to grn_plugin_charlen().
 * tokenizer  - supplies rest_length, the split_* / ignore_blank flags and
 *              the query encoding.  It is only read here.
 * ctypes     - character-type entries; one entry is stepped over per
 *              character consumed.  May be NULL.
 * token_tail - in/out: advanced by the byte length of every character
 *              that is consumed.
 *
 * Returns the number of characters consumed (0 when no grouping applies
 * or when grn_plugin_charlen() reports 0 for the first character).
 */
static int
forward_grouped_token_tail(grn_ctx *ctx,
                           grn_yangram_tokenizer *tokenizer,
                           const unsigned char *ctypes,
                           const unsigned char **token_tail)
{
  int n_chars = 0;
  int group_type;
  unsigned int char_bytes;
  unsigned int remaining = tokenizer->rest_length;

  if (!ctypes) {
    return 0;
  }

  /* Pick the class that forms the group; bail out if none is enabled. */
  group_type = GRN_STR_CTYPE(*ctypes);
  if (tokenizer->split_alpha == GRN_FALSE && group_type == GRN_CHAR_ALPHA) {
    /* group alphabetic run */
  } else if (tokenizer->split_digit == GRN_FALSE &&
             group_type == GRN_CHAR_DIGIT) {
    /* group digit run */
  } else if (tokenizer->split_symbol == GRN_FALSE &&
             group_type == GRN_CHAR_SYMBOL) {
    /* group symbol run */
  } else {
    return 0;
  }

  for (;;) {
    char_bytes = grn_plugin_charlen(ctx, (char *)*token_tail, remaining,
                                    tokenizer->query->encoding);
    if (char_bytes == 0) {
      break;
    }

    n_chars++;
    *token_tail += char_bytes;
    remaining -= char_bytes;

    /* The character just consumed is flagged as blank: end the run here
     * unless blanks are being ignored. */
    if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
      break;
    }

    /* Stop as soon as the next character belongs to another class. */
    ctypes++;
    if (GRN_STR_CTYPE(*ctypes) != group_type) {
      break;
    }
  }

  return n_chars;
}
/*
 * Tell whether a token of `token_size` characters ends at a border.
 *
 * GRN_TRUE is returned when any of the following holds:
 *   1. `ctypes` is given, blanks are not ignored, and the entry for the
 *      token's last character (ctypes[token_size - 1]) is flagged blank;
 *   2. `ctypes` is given and is_token_group() accepts the entry that
 *      follows the token (ctypes + token_size);
 *   3. a phrase table is set, there is a pending hit
 *      (current_hit < nhits), and the byte distance from
 *      tokenizer->scan_start to `token_tail` equals that hit's offset.
 * Otherwise GRN_FALSE is returned.
 *
 * ctx is unused.
 */
static grn_bool
is_group_border(GNUC_UNUSED grn_ctx *ctx,
                grn_yangram_tokenizer *tokenizer,
                const unsigned char *token_tail,
                const unsigned char *ctypes,
                int token_size)
{
  if (ctypes) {
    const unsigned char *last_ctype = ctypes + token_size - 1;

    if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*last_ctype)) {
      return GRN_TRUE;
    }
    if (is_token_group(tokenizer, last_ctype + 1)) {
      return GRN_TRUE;
    }
  }

  if (tokenizer->phrase_table &&
      tokenizer->nhits > 0 &&
      tokenizer->current_hit < tokenizer->nhits &&
      token_tail - (const unsigned char *)tokenizer->scan_start ==
        tokenizer->hits[tokenizer->current_hit].offset) {
    return GRN_TRUE;
  }

  return GRN_FALSE;
}
/*
 * Advance *token_tail over up to tokenizer->ngram_unit characters.
 *
 * The first character is always consumed if grn_plugin_charlen() reports
 * a non-zero length for it.  Each further character is consumed only
 * while none of these stop conditions holds:
 *   - the token already has ngram_unit characters;
 *   - grn_plugin_charlen() returns 0;
 *   - a phrase table is set and the current tail sits exactly on the
 *     offset of the pending hit (hits[current_hit]);
 *   - `ctypes` is given and the previous character is flagged blank
 *     (unless ignore_blank is set);
 *   - `ctypes` is given and the next character belongs to a class that is
 *     grouped rather than split (alpha / digit / symbol with the matching
 *     split_* flag equal to GRN_FALSE).
 *
 * token_tail is in/out.  Returns the number of characters consumed.
 */
static int
forward_ngram_token_tail(grn_ctx *ctx,
                         grn_yangram_tokenizer *tokenizer,
                         const unsigned char *ctypes,
                         const unsigned char **token_tail)
{
  int n_chars = 0;
  unsigned int char_bytes;
  unsigned int remaining = tokenizer->rest_length;

  char_bytes = grn_plugin_charlen(ctx, (char *)*token_tail, remaining,
                                  tokenizer->query->encoding);
  if (char_bytes == 0) {
    return 0;
  }
  n_chars++;
  *token_tail += char_bytes;
  remaining -= char_bytes;

  while (n_chars < tokenizer->ngram_unit) {
    char_bytes = grn_plugin_charlen(ctx, (char *)*token_tail, remaining,
                                    tokenizer->query->encoding);
    if (char_bytes == 0) {
      break;
    }

    /* Do not run into the start of the pending phrase hit. */
    if (tokenizer->phrase_table &&
        tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        *token_tail - (const unsigned char *)tokenizer->scan_start ==
          tokenizer->hits[tokenizer->current_hit].offset) {
      break;
    }

    if (ctypes) {
      int next_type;

      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
        break;
      }
      ctypes++;

      /* The next character starts a grouped run; leave it alone. */
      next_type = GRN_STR_CTYPE(*ctypes);
      if ((tokenizer->split_alpha == GRN_FALSE &&
           next_type == GRN_CHAR_ALPHA) ||
          (tokenizer->split_digit == GRN_FALSE &&
           next_type == GRN_CHAR_DIGIT) ||
          (tokenizer->split_symbol == GRN_FALSE &&
           next_type == GRN_CHAR_SYMBOL)) {
        break;
      }
    }

    n_chars++;
    *token_tail += char_bytes;
    remaining -= char_bytes;
  }

  return n_chars;
}
/*
 * Produce the next n-gram token and hand it to grn_tokenizer_token_push().
 *
 * The tokenizer state is taken from user_data->ptr.  Starting at
 * tokenizer->next, the token is formed in one of two ways:
 *
 *   - Grouped run: when character types are available and the first
 *     character is alpha / digit / symbol with the matching uni_* flag
 *     set, every following character of the same class is swallowed into
 *     one token.  Alpha and digit runs always end after a character
 *     flagged blank; symbol runs do so only when ignore_blank is off.
 *     tokenizer->overlap is cleared.
 *
 *   - N-gram: otherwise up to tokenizer->ngram_unit characters are taken,
 *     stopping early after a blank (unless ignore_blank) or before a
 *     character that would start a grouped run.  GRN_TOKEN_OVERLAP is set
 *     if the previous token left overlap set, GRN_TOKEN_UNMATURED if
 *     fewer than ngram_unit characters were taken.
 *
 * Afterwards pos / len / tail / skip are updated, GRN_TOKEN_LAST and
 * GRN_TOKEN_REACH_END are added when applicable, and the token bytes
 * [start, tail) are pushed.
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *start = tokenizer->next;
  const unsigned char *tail = start;
  const unsigned char *end = tokenizer->end;
  int32_t len = 0;
  int32_t pos = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  size_t cl;
  int is_grouped_run = 0;
  int run_type = 0;
  /* Alpha and digit runs stop at a blank even when ignore_blank is set;
   * only symbol runs honour ignore_blank. */
  int blank_always_ends_run = 1;

  if (cp) {
    const int head_type = GRN_STR_CTYPE(*cp);

    if (tokenizer->uni_alpha && head_type == GRN_CHAR_ALPHA) {
      is_grouped_run = 1;
    } else if (tokenizer->uni_digit && head_type == GRN_CHAR_DIGIT) {
      is_grouped_run = 1;
    } else if (tokenizer->uni_symbol && head_type == GRN_CHAR_SYMBOL) {
      is_grouped_run = 1;
      blank_always_ends_run = 0;
    }
    run_type = head_type;
  }

  if (is_grouped_run) {
    while ((cl = grn_charlen_(ctx, (char *)tail, (char *)end,
                              tokenizer->query->encoding))) {
      len++;
      tail += cl;
      if ((blank_always_ends_run || !tokenizer->ignore_blank) &&
          GRN_STR_ISBLANK(*cp)) {
        break;
      }
      cp++;
      if (GRN_STR_CTYPE(*cp) != run_type) {
        break;
      }
    }
    tokenizer->next = tail;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    /* TODO: grn_pat_lcp_search */
    if ((tid = grn_sym_common_prefix_search(sym, start))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    tail = start + grn_charlen_(ctx, start, end, tokenizer->query->encoding);
    if (tid && (len > 1 || tail == start)) {
      if (tail != start && pos + len - 1 <= tokenizer->tail) {
        continue;
      }
      start += strlen(key);
      if (!*start && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    cl = grn_charlen_(ctx, (char *)tail, (char *)end,
                      tokenizer->query->encoding);
    if (cl) {
      len++;
      tail += cl;
      /* The next token starts one character after this one. */
      tokenizer->next = tail;

      while (len < tokenizer->ngram_unit) {
        cl = grn_charlen_(ctx, (char *)tail, (char *)end,
                          tokenizer->query->encoding);
        if (!cl) {
          break;
        }
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) {
            break;
          }
          cp++;
          /* Never absorb the head of a grouped run into an n-gram. */
          if ((tokenizer->uni_alpha &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol &&
               GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        tail += cl;
      }

      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }

  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;

  if (start == tail || tokenizer->next == end) {
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    /* Overlapping n-grams advance by one character, others by the whole
     * token length. */
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (tail == end) {
    status |= GRN_TOKEN_REACH_END;
  }

  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           (const char *)start, tail - start, status);
  return NULL;
}
/*
 * Produce the next n-gram token and push it, followed by its status
 * word, onto the context with grn_ctx_push().
 *
 * The tokenizer state is taken from user_data->ptr.  Starting at
 * token->next, the token is formed in one of two ways:
 *
 *   - Grouped run: when character types are available and the first
 *     character is grn_str_alpha / grn_str_digit / grn_str_symbol with
 *     the matching uni_* flag set, following characters of the same class
 *     are swallowed into one token; the run also ends after a character
 *     flagged blank.  token->overlap is cleared.
 *
 *   - N-gram: otherwise up to token->ngram_unit characters are taken,
 *     stopping early after a blank or before a character that would start
 *     a grouped run.  GRN_TOKEN_OVERLAP is set if the previous token left
 *     overlap set, GRN_TOKEN_UNMATURED if fewer than ngram_unit
 *     characters were taken; token->overlap is then set to 1.
 *
 * Afterwards pos / len / tail / skip are updated and GRN_TOKEN_LAST is
 * added when nothing was consumed or the end of input was reached.
 *
 * `table` is not used.  Always returns GRN_SUCCESS.
 */
static grn_rc
ngram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_ngram_tokenizer *token = user_data->ptr;
  const unsigned char *start = token->next;
  const unsigned char *tail = start;
  const unsigned char *end = token->end;
  int32_t len = 0;
  int32_t pos = token->pos + token->skip;
  int32_t status = 0;
  uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
  size_t cl;
  int is_grouped_run = 0;
  int run_type = 0;

  if (cp) {
    const int head_type = GRN_STR_CTYPE(*cp);

    if ((token->uni_alpha && head_type == grn_str_alpha) ||
        (token->uni_digit && head_type == grn_str_digit) ||
        (token->uni_symbol && head_type == grn_str_symbol)) {
      is_grouped_run = 1;
    }
    run_type = head_type;
  }

  if (is_grouped_run) {
    while ((cl = grn_charlen_(ctx, (char *)tail, (char *)end,
                              token->encoding))) {
      len++;
      tail += cl;
      if (GRN_STR_ISBLANK(*cp)) {
        break;
      }
      cp++;
      if (GRN_STR_CTYPE(*cp) != run_type) {
        break;
      }
    }
    token->next = tail;
    token->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    /* TODO: grn_pat_lcp_search */
    if ((tid = grn_sym_common_prefix_search(sym, start))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        token->status = grn_token_not_found;
        return GRN_ID_NIL;
      }
      len = grn_str_len(key, token->encoding, NULL);
    }
    tail = start + grn_charlen_(ctx, start, end, token->encoding);
    if (tid && (len > 1 || tail == start)) {
      if (tail != start && pos + len - 1 <= token->tail) {
        continue;
      }
      start += strlen(key);
      if (!*start && !token->add) {
        token->status = grn_token_done;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    cl = grn_charlen_(ctx, (char *)tail, (char *)end, token->encoding);
    if (cl) {
      len++;
      tail += cl;
      /* The next token starts one character after this one. */
      token->next = tail;

      while (len < token->ngram_unit) {
        cl = grn_charlen_(ctx, (char *)tail, (char *)end, token->encoding);
        if (!cl) {
          break;
        }
        if (cp) {
          if (GRN_STR_ISBLANK(*cp)) {
            break;
          }
          cp++;
          /* Never absorb the head of a grouped run into an n-gram. */
          if ((token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) ||
              (token->uni_digit && GRN_STR_CTYPE(*cp) == grn_str_digit) ||
              (token->uni_symbol && GRN_STR_CTYPE(*cp) == grn_str_symbol)) {
            break;
          }
        }
        len++;
        tail += cl;
      }

      if (token->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < token->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      token->overlap = 1;
    }
  }

  token->pos = pos;
  token->len = len;
  token->tail = pos + len - 1;

  if (start == tail || tail == end) {
    token->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    /* Overlapping n-grams advance by one character, grouped runs by the
     * whole token length. */
    token->skip = token->overlap ? 1 : len;
  }

  GRN_TEXT_SET_REF(&token->curr_, start, tail - start);
  GRN_UINT32_SET(ctx, &token->stat_, status);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
/*
 * Produce the next token of the regexp tokenizer and hand it to
 * grn_tokenizer_token_push().
 *
 * The tokenizer state is taken from user_data->ptr.  Tokens are built
 * from at most two characters (ngram_unit); the next call resumes one
 * character after the start of the current token.
 *
 * When the tokenize mode is not GRN_TOKEN_GET:
 *   - the first call emits GRN_TOKENIZER_BEGIN_MARK_UTF8;
 *   - once is_end has been set, GRN_TOKENIZER_END_MARK_UTF8 is emitted
 *     with GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
 *   - a token that would start right after a blank-terminated one is
 *     replaced by an empty GRN_TOKEN_SKIP token when the preceding
 *     character is flagged blank.
 *
 * When the tokenize mode is GRN_TOKEN_GET:
 *   - a begin mark at the very start of the input and an end mark at the
 *     very end are emitted as tokens of their own;
 *   - an end mark is never merged into a preceding token;
 *   - tokenizer->get.n_skip_tokens is used to flag following tokens with
 *     GRN_TOKEN_SKIP.
 *
 * In both modes an empty token flagged LAST | REACH_END is emitted when
 * no character is left, GRN_TOKEN_OVERLAP reflects the previous token's
 * is_overlapping state, and GRN_TOKEN_UNMATURED marks tokens shorter than
 * ngram_unit.
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  /* Unsigned so that comparisons with n_characters stay same-signed. */
  const unsigned int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  /* Snapshot the one-shot flags before they are reset below. */
  grn_bool is_begin = tokenizer->is_begin;
  grn_bool is_start_token = tokenizer->is_start_token;
  grn_bool break_by_blank = GRN_FALSE;
  grn_bool break_by_end_mark = GRN_FALSE;

  GRN_BULK_REWIND(buffer);
  tokenizer->is_begin = GRN_FALSE;
  tokenizer->is_start_token = GRN_FALSE;

  /* Point char_types at the entry of the character at `current`. */
  if (char_types) {
    char_types += tokenizer->nth_char;
  }

  if (mode != GRN_TOKEN_GET) {
    if (is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    if (is_start_token) {
      /* is_start_token is only set after a previous call consumed at
       * least one character, so char_types[-1] is the entry of the
       * character before `current`. */
      if (char_types && GRN_STR_ISBLANK(char_types[-1])) {
        status |= GRN_TOKEN_SKIP;
        grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
        return NULL;
      }
    }
  }

  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }

  if (mode == GRN_TOKEN_GET) {
    /* A leading begin mark forms a token of its own. */
    if (is_begin &&
        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      tokenizer->next = current;
      tokenizer->nth_char++;
      if (current == end) {
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      }
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    /* A trailing end mark forms the final token. */
    if (current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }

  while (GRN_TRUE) {
    n_characters++;
    GRN_TEXT_PUT(ctx, buffer, current, char_len);
    current += char_len;
    if (n_characters == 1) {
      /* The next call resumes right after this token's first character. */
      tokenizer->next = current;
      tokenizer->nth_char++;
    }

    if (char_types) {
      uint_least8_t char_type;
      char_type = char_types[0];
      char_types++;
      if (GRN_STR_ISBLANK(char_type)) {
        break_by_blank = GRN_TRUE;
      }
    }

    char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }

    /* In GET mode the trailing end mark must not join this token. */
    if (mode == GRN_TOKEN_GET &&
        current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      break_by_end_mark = GRN_TRUE;
    }

    if (break_by_blank || break_by_end_mark) {
      break;
    }

    if (n_characters == ngram_unit) {
      break;
    }
  }

  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);

  if (mode == GRN_TOKEN_GET) {
    if (current == end) {
      tokenizer->is_end = GRN_TRUE;
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      if (status & GRN_TOKEN_UNMATURED) {
        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
      } else if (break_by_end_mark) {
        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
          status |= GRN_TOKEN_SKIP;
        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
    if (break_by_blank) {
      tokenizer->is_start_token = GRN_TRUE;
    }
  }

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);
  return NULL;
}