/*
 * Advance *token_tail over a maximal run of characters that all share
 * one character class (the run is later emitted as a single "grouped"
 * token instead of n-grams).
 *
 * Stops when:
 *   - grn_plugin_charlen() returns 0 (end of input or invalid byte), or
 *   - the current character carries the blank flag and blanks are not
 *     ignored, or
 *   - the next character's class differs from group_ctype.
 *
 * Returns the number of characters consumed; *token_tail is moved past
 * the run. ctypes must point at the ctype entry for the first character.
 */
static int
forward_token_run(grn_ctx *ctx,
                  grn_yangram_tokenizer *tokenizer,
                  const unsigned char *ctypes,
                  const unsigned char **token_tail,
                  unsigned char group_ctype)
{
  int token_size = 0;
  unsigned int char_length;
  unsigned int rest_length = tokenizer->rest_length;
  while ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail,
                                           rest_length,
                                           tokenizer->query->encoding))) {
    token_size++;
    *token_tail += char_length;
    rest_length -= char_length;
    /* A blank flag on the character just consumed ends the group
       (unless blanks are explicitly ignored). */
    if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
      break;
    }
    /* Peek at the next character's class; leaving the class ends the run. */
    if (GRN_STR_CTYPE(*++ctypes) != group_ctype) {
      break;
    }
  }
  return token_size;
}

/*
 * Group a run of ALPHA, DIGIT or SYMBOL characters into one token,
 * depending on which classes are configured not to be split.
 *
 * Returns the number of characters consumed (0 when ctypes is NULL or
 * the first character does not start a groupable run); *token_tail is
 * advanced accordingly.
 */
static int
forward_grouped_token_tail(grn_ctx *ctx,
                           grn_yangram_tokenizer *tokenizer,
                           const unsigned char *ctypes,
                           const unsigned char **token_tail)
{
  if (!ctypes) {
    return 0;
  }
  if (tokenizer->split_alpha == GRN_FALSE &&
      GRN_STR_CTYPE(*ctypes) == GRN_CHAR_ALPHA) {
    return forward_token_run(ctx, tokenizer, ctypes, token_tail,
                             GRN_CHAR_ALPHA);
  }
  if (tokenizer->split_digit == GRN_FALSE &&
      GRN_STR_CTYPE(*ctypes) == GRN_CHAR_DIGIT) {
    return forward_token_run(ctx, tokenizer, ctypes, token_tail,
                             GRN_CHAR_DIGIT);
  }
  if (tokenizer->split_symbol == GRN_FALSE &&
      GRN_STR_CTYPE(*ctypes) == GRN_CHAR_SYMBOL) {
    return forward_token_run(ctx, tokenizer, ctypes, token_tail,
                             GRN_CHAR_SYMBOL);
  }
  return 0;
}
/*
 * Advance *token_tail over up to ngram_unit characters, forming one
 * n-gram token.
 *
 * After consuming the first character unconditionally, each further
 * character is taken only if:
 *   - it does not start a registered phrase hit (phrase boundaries must
 *     begin their own token),
 *   - the previously consumed character is not blank-flagged (unless
 *     ignore_blank is set), and
 *   - it does not belong to a class (ALPHA/DIGIT/SYMBOL) that is
 *     grouped rather than split — such characters start a grouped token.
 *
 * Returns the number of characters consumed; *token_tail is advanced
 * past them. ctypes may be NULL (no normalizer ctype info available).
 */
static int
forward_ngram_token_tail(grn_ctx *ctx,
                         grn_yangram_tokenizer *tokenizer,
                         const unsigned char *ctypes,
                         const unsigned char **token_tail)
{
  int token_size = 0;
  unsigned int char_length;
  unsigned int rest_length = tokenizer->rest_length;
  /* Always consume the first character when one is available. */
  if ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail,
                                        rest_length,
                                        tokenizer->query->encoding))) {
    token_size++;
    *token_tail += char_length;
    rest_length -= char_length;
    while (token_size < tokenizer->ngram_unit &&
           (char_length = grn_plugin_charlen(ctx, (char *)*token_tail,
                                             rest_length,
                                             tokenizer->query->encoding))) {
      /* Stop before a position where a phrase-table hit begins. */
      if (tokenizer->phrase_table) {
        if (tokenizer->nhits > 0 &&
            tokenizer->current_hit < tokenizer->nhits &&
            *token_tail - (const unsigned char *)tokenizer->scan_start ==
            tokenizer->hits[tokenizer->current_hit].offset) {
          break;
        }
      }
      if (ctypes) {
        /* *ctypes still refers to the last consumed character here;
           a blank flag on it ends the n-gram. */
        if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
          break;
        }
        ctypes++;
        /* Stop before a character whose class is grouped, not split. */
        if ((tokenizer->split_alpha == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_ALPHA) ||
            (tokenizer->split_digit == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_DIGIT) ||
            (tokenizer->split_symbol == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_SYMBOL)) {
          break;
        }
      }
      token_size++;
      *token_tail += char_length;
      rest_length -= char_length;
    }
  }
  return token_size;
}
/*
 * Emit the next token of the sample tokenizer: exactly one character
 * per call. Reports GRN_TOKENIZER_LAST when the current character is
 * the final one (or when no valid character remains), otherwise
 * GRN_TOKENIZER_CONTINUE.
 */
static grn_obj *
sample_next(grn_ctx *ctx,
            GNUC_UNUSED int nargs,
            GNUC_UNUSED grn_obj **args,
            grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;
  grn_encoding enc = tokenizer->query->encoding;
  const char *current = tokenizer->next;
  int current_length =
    grn_plugin_charlen(ctx, current, tokenizer->rest, enc);
  /* Last token when no character could be read or nothing follows it. */
  grn_tokenizer_status status =
    (current_length == 0 || tokenizer->rest - current_length == 0)
      ? GRN_TOKENIZER_LAST
      : GRN_TOKENIZER_CONTINUE;
  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           current, current_length, status);
  tokenizer->next += current_length;
  tokenizer->rest -= current_length;
  return NULL;
}
/*
 * Advance *token_tail over the characters covered by a phrase-table
 * hit of scan_length bytes, so the whole hit becomes one token.
 * Returns the number of characters consumed.
 */
static int
forward_scan_hit_token_tail(grn_ctx *ctx,
                            grn_yangram_tokenizer *tokenizer,
                            const unsigned char **token_tail,
                            unsigned int scan_length)
{
  int n_chars = 0;
  unsigned int remaining = tokenizer->rest_length;
  const unsigned char *hit_top = *token_tail;
  for (;;) {
    unsigned int len = grn_plugin_charlen(ctx, (char *)*token_tail,
                                          remaining,
                                          tokenizer->query->encoding);
    if (len == 0) {
      break;  /* end of input or invalid byte sequence */
    }
    n_chars++;
    *token_tail += len;
    remaining -= len;
    /* Stop once the hit's byte length has been covered. */
    if (*token_tail - hit_top >= scan_length) {
      break;
    }
  }
  return n_chars;
}
/*
 * Deprecated alias kept only for backward compatibility.
 * New code should call grn_plugin_charlen() directly.
 */
int
grn_tokenizer_charlen(grn_ctx *ctx,
                      const char *str_ptr,
                      unsigned int str_length,
                      grn_encoding encoding)
{
  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
}
/*
 * Produce the next token of the yangram tokenizer.
 *
 * Each call cuts one token starting at tokenizer->next:
 *   1. If a phrase-table hit begins exactly here, the whole hit is the
 *      token.
 *   2. Otherwise, if the leading characters form a non-split group
 *      (alpha/digit/symbol), the whole run is the token.
 *   3. Otherwise an n-gram of up to ngram_unit characters is cut; the
 *      next call starts one character later (overlapping n-grams).
 * When a vgram table is configured, frequent bigrams may be extended to
 * trigrams (and, with VGRAM_QUAD, to quadgrams).
 *
 * The token is pushed via grn_tokenizer_token_push() together with
 * status flags (LAST/REACH_END/UNMATURED/OVERLAP/SKIP/FORCE_PREFIX);
 * scan state (next, rest_length, ctypes_next, pushed_token_tail) is
 * updated for the following call. Always returns NULL.
 */
static grn_obj *
yangram_next(grn_ctx *ctx,
             GNUC_UNUSED int nargs,
             GNUC_UNUSED grn_obj **args,
             grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *string_end = tokenizer->end;
  const unsigned char *token_top = tokenizer->next;   /* token start */
  const unsigned char *token_next = token_top;        /* next call's start */
  const unsigned char *token_tail = token_top;        /* token end (moved below) */
  int token_size = 0;
  grn_bool is_token_grouped = GRN_FALSE;
  const unsigned char *token_ctypes = NULL;
  unsigned int ctypes_skip_size;
  int char_length = 0;
  grn_tokenizer_status status = 0;
  grn_bool is_token_hit = GRN_FALSE;
  grn_obj *lexicon = args[0];
  if (tokenizer->phrase_table) {
    /* Drop the current hit once the scan position has passed it. */
    if (tokenizer->nhits > 0 &&
        token_top - (const unsigned char *)tokenizer->scan_start >
        tokenizer->hits[tokenizer->current_hit].offset) {
      tokenizer->current_hit++;
    }
    /* All buffered hits consumed: scan the remaining input for more
       (grn_pat_scan fills up to MAX_N_HITS and advances scan_rest). */
    if (tokenizer->current_hit >= tokenizer->nhits) {
      tokenizer->scan_start = tokenizer->scan_rest;
      unsigned int scan_rest_length =
        tokenizer->end - (const unsigned char *)tokenizer->scan_rest;
      if (scan_rest_length > 0) {
        tokenizer->nhits =
          grn_pat_scan(ctx, (grn_pat *)tokenizer->phrase_table,
                       tokenizer->scan_rest,
                       scan_rest_length,
                       tokenizer->hits, MAX_N_HITS,
                       &(tokenizer->scan_rest));
        tokenizer->current_hit = 0;
      }
    }
    /* Does a registered phrase begin exactly at the current position? */
    if (tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        token_top - (const unsigned char *)tokenizer->scan_start ==
        tokenizer->hits[tokenizer->current_hit].offset) {
      is_token_hit = GRN_TRUE;
    }
  }
  /* ctype entry for the first character of this token, when available. */
  if (tokenizer->ctypes) {
    token_ctypes = tokenizer->ctypes + tokenizer->ctypes_next;
  } else {
    token_ctypes = NULL;
  }
  if (is_token_hit) {
    /* Emit the whole phrase hit as one token; resume after it. */
    token_size =
      forward_scan_hit_token_tail(ctx, tokenizer, &token_tail,
                                  tokenizer->hits[tokenizer->current_hit].length);
    token_next = token_tail;
    tokenizer->current_hit++;
  } else {
    is_token_grouped = is_token_group(tokenizer, token_ctypes);
    if (is_token_grouped) {
      /* Grouped run (alpha/digit/symbol): resume after the run. */
      token_size = forward_grouped_token_tail(ctx, tokenizer, token_ctypes,
                                              &token_tail);
      token_next = token_tail;
    } else {
      /* Plain n-gram: resume one character later so n-grams overlap. */
      token_size = forward_ngram_token_tail(ctx, tokenizer, token_ctypes,
                                            &token_tail);
      char_length = grn_plugin_charlen(ctx, (char *)token_next,
                                       tokenizer->rest_length,
                                       tokenizer->query->encoding);
      token_next += char_length;
    }
  }
  /* How many ctype entries to skip before the next call: the whole
     token for hit/grouped tokens, one character for overlapping
     n-grams, none at the end of input. */
  if (token_top == token_tail || token_next == string_end) {
    ctypes_skip_size = 0;
  } else {
    if (is_token_grouped || is_token_hit) {
      ctypes_skip_size = token_size;
    } else {
      ctypes_skip_size = 1;
    }
  }
  if (tokenizer->use_vgram > 0 && !is_token_grouped) {
    /* Variable-gram extension: if the cut n-gram is registered in the
       vgram table, extend it by one more character (two for
       VGRAM_QUAD), shrinking the index by folding frequent grams. */
    grn_bool maybe_vgram = GRN_FALSE;
    grn_id id;
    id = grn_table_get(ctx, tokenizer->vgram_table,
                       (const char *)token_top, token_tail - token_top);
    if (id) {
      maybe_vgram = GRN_TRUE;
    }
    if (tokenizer->use_vgram >= VGRAM_BOTH && !maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes,
                           token_size)) {
        /* Also consider the following overlapping gram.
           NOTE(review): this inner `id` shadows the outer one. */
        grn_id id;
        const unsigned char *token_next_tail;
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_next_tail = token_tail + char_length;
        id = grn_table_get(ctx, tokenizer->vgram_table,
                           (const char *)token_next,
                           token_next_tail - token_next);
        if (id) {
          maybe_vgram = GRN_TRUE;
        }
      } else if (token_tail == string_end &&
                 tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
        /* Truncated query tail: assume it may be a vgram prefix. */
        maybe_vgram = GRN_TRUE;
      }
    }
    if (maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes,
                           token_size)) {
        /* Extend the token by one character. */
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_size++;
        token_tail += char_length;
        if (tokenizer->use_vgram == VGRAM_QUAD) {
          if (token_tail < string_end &&
              !is_group_border(ctx, tokenizer, token_tail, token_ctypes,
                               token_size)) {
            /* Extend once more if the trigram itself is registered. */
            id = grn_table_get(ctx, tokenizer->vgram_table,
                               (const char *)token_top,
                               token_tail - token_top);
            if (id) {
              char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                               tokenizer->rest_length,
                                               tokenizer->query->encoding);
              token_size++;
              token_tail += char_length;
            }
          } else {
            /* Cannot extend at query time: register the partial gram in
               the lexicon and search it as a prefix. */
            if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
              grn_id tid;
              tid = grn_table_get(ctx, lexicon, (const char *)token_top,
                                  token_tail - token_top);
              if (tid == GRN_ID_NIL) {
                int added;
                grn_table_add(ctx, lexicon, (const char *)token_top,
                              token_tail - token_top, &added);
              }
              status |= GRN_TOKEN_FORCE_PREFIX;
            }
          }
        }
      } else {
        /* Same prefix fallback when no extension is possible at all. */
        if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
          grn_id tid;
          tid = grn_table_get(ctx, lexicon, (const char *)token_top,
                              token_tail - token_top);
          if (tid == GRN_ID_NIL) {
            int added;
            grn_table_add(ctx, lexicon, (const char *)token_top,
                          token_tail - token_top, &added);
          }
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    }
  }
  /* Status flags for the token being pushed. */
  if (token_top == token_tail || token_next == string_end) {
    status |= GRN_TOKEN_LAST;
  }
  if (token_tail == string_end) {
    status |= GRN_TOKEN_REACH_END;
  }
  if (!is_token_grouped && !is_token_hit &&
      token_size < tokenizer->ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  /* Overlap with the previously pushed token; in GET mode overlapping
     tokens may be skipped entirely to speed up searching. */
  if (tokenizer->pushed_token_tail &&
      token_top < tokenizer->pushed_token_tail) {
    status |= GRN_TOKEN_OVERLAP;
    if (tokenizer->skip_overlap &&
        !grn_ii_overlap_token_skip_enable &&
        !(status & GRN_TOKEN_REACH_END) &&
        !(status & GRN_TOKEN_SKIP_WITH_POSITION) &&
        tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
      if (token_tail <= tokenizer->pushed_token_tail) {
        /* Fully covered by the previous token. */
        status |= GRN_TOKEN_SKIP;
      } else {
        if (!is_group_border(ctx, tokenizer, token_tail, token_ctypes,
                             token_size)) {
          status |= GRN_TOKEN_SKIP;
        }
      }
    }
  }
  /* Only non-skipped tokens update the overlap reference point. */
  if (!(status & GRN_TOKEN_SKIP) &&
      !(status & GRN_TOKEN_SKIP_WITH_POSITION)) {
    tokenizer->pushed_token_tail = token_tail;
  }
  tokenizer->next = token_next;
  tokenizer->rest_length = string_end - token_next;
  tokenizer->ctypes_next = tokenizer->ctypes_next + ctypes_skip_size;
  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           (const char *)token_top,
                           token_tail - token_top,
                           status);
  return NULL;
}