/*
 * Emit the next token of the N-gram tokenizer kept in user_data->ptr.
 *
 * Scanning starts at tokenizer->next and never goes past tokenizer->end.
 * Two shapes of token are produced:
 *   - a "run": when per-character types are available and the matching
 *     uni_alpha / uni_digit / uni_symbol option is set, consecutive
 *     characters of the same type are emitted as one token;
 *   - an N-gram: otherwise up to tokenizer->ngram_unit characters.
 *
 * The token is handed over with grn_tokenizer_token_push() together with
 * its status flags; pos/len/tail/skip/overlap/next in the tokenizer are
 * updated for the following call.
 *
 * nargs and args are not used by this function.  Always returns NULL.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *head = tokenizer->next;  /* first byte of the token */
  const unsigned char *cursor = head;           /* one past the last consumed byte */
  const unsigned char *limit = tokenizer->end;
  int32_t n_chars = 0;
  int32_t position = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  const uint_least8_t *ctype = NULL;
  size_t char_bytes;
  int is_run = 0;          /* non-zero when a same-type run is being emitted */
  int run_type = 0;        /* character type shared by the whole run */
  int break_on_blank = 1;  /* whether a blank marker terminates the run */

  if (tokenizer->ctypes) {
    ctype = tokenizer->ctypes + position;
  }

  /* Decide whether the first character starts a same-type run. */
  if (ctype) {
    if (tokenizer->uni_alpha && GRN_STR_CTYPE(*ctype) == GRN_CHAR_ALPHA) {
      is_run = 1;
      run_type = GRN_CHAR_ALPHA;
    } else if (tokenizer->uni_digit && GRN_STR_CTYPE(*ctype) == GRN_CHAR_DIGIT) {
      is_run = 1;
      run_type = GRN_CHAR_DIGIT;
    } else if (tokenizer->uni_symbol && GRN_STR_CTYPE(*ctype) == GRN_CHAR_SYMBOL) {
      is_run = 1;
      run_type = GRN_CHAR_SYMBOL;
      /* Only symbol runs honour ignore_blank; alpha and digit runs always
         stop at a blank marker. */
      break_on_blank = !tokenizer->ignore_blank;
    }
  }

  if (is_run) {
    for (;;) {
      char_bytes = grn_charlen_(ctx, (char *)cursor, (char *)limit,
                                tokenizer->query->encoding);
      if (!char_bytes) {
        break;
      }
      n_chars++;
      cursor += char_bytes;
      /* The blank check looks at the character just consumed ... */
      if (break_on_blank && GRN_STR_ISBLANK(*ctype)) {
        break;
      }
      /* ... and the type check at the one that would come next. */
      ctype++;
      if (GRN_STR_CTYPE(*ctype) != run_type) {
        break;
      }
    }
    tokenizer->next = cursor;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    /* NOTE(review): this section is compiled only when
       PRE_DEFINED_UNSPLIT_WORDS is defined; it uses tid and sym, which are
       not declared in this function, and a continue outside any loop. */
    const unsigned char *key = NULL;
    /* todo : grn_pat_lcp_search */
    if ((tid = grn_sym_common_prefix_search(sym, head))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      n_chars = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    cursor = head + grn_charlen_(ctx, head, limit, tokenizer->query->encoding);
    if (tid && (n_chars > 1 || cursor == head)) {
      if (cursor != head && position + n_chars - 1 <= tokenizer->tail) {
        continue;
      }
      head += strlen(key);
      if (!*head && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    char_bytes = grn_charlen_(ctx, (char *)cursor, (char *)limit,
                              tokenizer->query->encoding);
    if (char_bytes) {
      n_chars++;
      cursor += char_bytes;
      /* The following token starts right after this first character. */
      tokenizer->next = cursor;

      while (n_chars < tokenizer->ngram_unit) {
        char_bytes = grn_charlen_(ctx, (char *)cursor, (char *)limit,
                                  tokenizer->query->encoding);
        if (!char_bytes) {
          break;
        }
        if (ctype) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctype)) {
            break;
          }
          ctype++;
          /* Do not absorb a character that would start its own run. */
          if (tokenizer->uni_alpha && GRN_STR_CTYPE(*ctype) == GRN_CHAR_ALPHA) {
            break;
          }
          if (tokenizer->uni_digit && GRN_STR_CTYPE(*ctype) == GRN_CHAR_DIGIT) {
            break;
          }
          if (tokenizer->uni_symbol && GRN_STR_CTYPE(*ctype) == GRN_CHAR_SYMBOL) {
            break;
          }
        }
        n_chars++;
        cursor += char_bytes;
      }

      /* overlap still holds the value left by the previous call here. */
      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (n_chars < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (n_chars > 1) ? 1 : 0;
    }
  }

  tokenizer->pos = position;
  tokenizer->len = n_chars;
  tokenizer->tail = position + n_chars - 1;

  if (head != cursor && tokenizer->next != limit) {
    /* More input remains: advance by one character when tokens overlap,
       otherwise by the whole token. */
    tokenizer->skip = tokenizer->overlap ? 1 : n_chars;
  } else {
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  }
  if (cursor == limit) {
    status |= GRN_TOKEN_REACH_END;
  }

  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           (const char *)head, cursor - head, status);
  return NULL;
}
/*
 * Emit the next token of the N-gram tokenizer kept in user_data->ptr.
 *
 * Scanning starts at the tokenizer's next pointer and never goes past its
 * end pointer.  When per-character types are available and the matching
 * uni_alpha / uni_digit / uni_symbol option is set, consecutive characters
 * of the same type form one token; otherwise up to ngram_unit characters
 * are taken.
 *
 * The token text (as a reference into the scanned buffer) and its status
 * flags are stored in curr_ and stat_ and both are pushed with
 * grn_ctx_push(); pos/len/tail/skip/overlap/next are updated for the
 * following call.
 *
 * table is not used by this function.  Always returns GRN_SUCCESS.
 */
static grn_rc
ngram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *head = tokenizer->next;  /* first byte of the token */
  const unsigned char *cursor = head;           /* one past the last consumed byte */
  const unsigned char *limit = tokenizer->end;
  int32_t n_chars = 0;
  int32_t position = tokenizer->pos + tokenizer->skip;
  int32_t status = 0;
  uint_least8_t *ctype = NULL;
  size_t char_bytes;
  int is_run = 0;    /* non-zero when a same-type run is being emitted */
  int run_type = 0;  /* character type shared by the whole run */

  if (tokenizer->ctypes) {
    ctype = tokenizer->ctypes + position;
  }

  /* Decide whether the first character starts a same-type run. */
  if (ctype) {
    if (tokenizer->uni_alpha && GRN_STR_CTYPE(*ctype) == grn_str_alpha) {
      is_run = 1;
      run_type = grn_str_alpha;
    } else if (tokenizer->uni_digit && GRN_STR_CTYPE(*ctype) == grn_str_digit) {
      is_run = 1;
      run_type = grn_str_digit;
    } else if (tokenizer->uni_symbol && GRN_STR_CTYPE(*ctype) == grn_str_symbol) {
      is_run = 1;
      run_type = grn_str_symbol;
    }
  }

  if (is_run) {
    for (;;) {
      char_bytes = grn_charlen_(ctx, (char *)cursor, (char *)limit,
                                tokenizer->encoding);
      if (!char_bytes) {
        break;
      }
      n_chars++;
      cursor += char_bytes;
      /* The blank check looks at the character just consumed ... */
      if (GRN_STR_ISBLANK(*ctype)) {
        break;
      }
      /* ... and the type check at the one that would come next. */
      ctype++;
      if (GRN_STR_CTYPE(*ctype) != run_type) {
        break;
      }
    }
    tokenizer->next = cursor;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    /* NOTE(review): this section is compiled only when
       PRE_DEFINED_UNSPLIT_WORDS is defined; it uses tid and sym, which are
       not declared in this function, and a continue outside any loop. */
    const unsigned char *key = NULL;
    /* todo : grn_pat_lcp_search */
    if ((tid = grn_sym_common_prefix_search(sym, head))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = grn_token_not_found;
        return GRN_ID_NIL;
      }
      n_chars = grn_str_len(key, tokenizer->encoding, NULL);
    }
    cursor = head + grn_charlen_(ctx, head, limit, tokenizer->encoding);
    if (tid && (n_chars > 1 || cursor == head)) {
      if (cursor != head && position + n_chars - 1 <= tokenizer->tail) {
        continue;
      }
      head += strlen(key);
      if (!*head && !tokenizer->add) {
        tokenizer->status = grn_token_done;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    char_bytes = grn_charlen_(ctx, (char *)cursor, (char *)limit,
                              tokenizer->encoding);
    if (char_bytes) {
      n_chars++;
      cursor += char_bytes;
      /* The following token starts right after this first character. */
      tokenizer->next = cursor;

      while (n_chars < tokenizer->ngram_unit) {
        char_bytes = grn_charlen_(ctx, (char *)cursor, (char *)limit,
                                  tokenizer->encoding);
        if (!char_bytes) {
          break;
        }
        if (ctype) {
          if (GRN_STR_ISBLANK(*ctype)) {
            break;
          }
          ctype++;
          /* Do not absorb a character that would start its own run. */
          if (tokenizer->uni_alpha && GRN_STR_CTYPE(*ctype) == grn_str_alpha) {
            break;
          }
          if (tokenizer->uni_digit && GRN_STR_CTYPE(*ctype) == grn_str_digit) {
            break;
          }
          if (tokenizer->uni_symbol && GRN_STR_CTYPE(*ctype) == grn_str_symbol) {
            break;
          }
        }
        n_chars++;
        cursor += char_bytes;
      }

      /* overlap still holds the value left by the previous call here. */
      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (n_chars < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = 1;
    }
  }

  tokenizer->pos = position;
  tokenizer->len = n_chars;
  tokenizer->tail = position + n_chars - 1;

  if (head != cursor && cursor != limit) {
    /* More input remains: advance by one character when tokens overlap,
       otherwise by the whole token. */
    tokenizer->skip = tokenizer->overlap ? 1 : n_chars;
  } else {
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  }

  /* Publish the token text and its status, in that order. */
  GRN_TEXT_SET_REF(&tokenizer->curr_, head, cursor - head);
  GRN_UINT32_SET(ctx, &tokenizer->stat_, status);
  grn_ctx_push(ctx, &tokenizer->curr_);
  grn_ctx_push(ctx, &tokenizer->stat_);
  return GRN_SUCCESS;
}