/*
 * Reports whether the byte range [str_ptr, str_ptr + str_length) contains
 * at least one character accepted by grn_tokenizer_is_tokenized_delimiter().
 *
 * Only GRN_ENC_UTF8 input is examined; any other encoding, or an empty
 * range, yields GRN_FALSE immediately.  The scan walks the range one
 * character at a time using grn_charlen_() and gives up (GRN_FALSE) as soon
 * as that function reports a non-positive length.
 */
grn_bool
grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  const char *cursor;
  const char *tail;

  if (encoding != GRN_ENC_UTF8 || str_length == 0) {
    return GRN_FALSE;
  }

  cursor = str_ptr;
  tail = str_ptr + str_length;
  for (;;) {
    int n_bytes = grn_charlen_(ctx, cursor, tail, encoding);

    if (n_bytes <= 0) {
      /* End of input, or a character whose length could not be determined. */
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, cursor, n_bytes, encoding)) {
      return GRN_TRUE;
    }
    cursor += n_bytes;
  }

  return GRN_FALSE;
}
/*
 * Cuts the next token out of [str_ptr, str_ptr + str_length) at the first
 * character accepted by grn_tokenizer_is_tokenized_delimiter().
 *
 * The token stored through grn_token_set_data() always starts at str_ptr and
 * covers every byte scanned before the scan stopped (the delimiter itself is
 * not included).  The token status is GRN_TOKEN_LAST when the scan consumed
 * the whole range, GRN_TOKEN_CONTINUE otherwise.
 *
 * Returns a pointer to the byte just after the delimiter, or NULL when no
 * delimiter was found.  Note that when grn_charlen_() returns 0 before the
 * end of the range, the status is GRN_TOKEN_CONTINUE while the return value
 * is still NULL.
 */
const char *
grn_tokenizer_next_by_tokenized_delimiter(grn_ctx *ctx,
                                          grn_token *token,
                                          const char *str_ptr,
                                          unsigned int str_length,
                                          grn_encoding encoding)
{
  const char *const tail = str_ptr + str_length;
  const char *cursor = str_ptr;
  const char *rest = NULL;

  while (cursor < tail) {
    size_t n_bytes = grn_charlen_(ctx, cursor, tail, encoding);

    if (n_bytes == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, cursor, n_bytes, encoding)) {
      /* Resume after the delimiter on the next call. */
      rest = cursor + n_bytes;
      break;
    }
    cursor += n_bytes;
  }

  grn_token_set_data(ctx, token, str_ptr, cursor - str_ptr);
  if (cursor != tail) {
    grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
  } else {
    grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
  }

  return rest;
}
/*
 * Cuts the next token out of [str_ptr, str_ptr + str_length) at the first
 * character accepted by grn_tokenizer_is_tokenized_delimiter() and hands it
 * to grn_tokenizer_token_push().
 *
 * The pushed token always starts at str_ptr and covers every byte scanned
 * before the scan stopped (the delimiter itself is not included).  Its
 * status is GRN_TOKENIZER_LAST when the scan consumed the whole range and
 * GRN_TOKENIZER_CONTINUE otherwise.
 *
 * Returns a pointer to the byte just after the delimiter, or NULL when no
 * delimiter was found.  Note that when grn_charlen_() returns 0 before the
 * end of the range, the status is GRN_TOKENIZER_CONTINUE while the return
 * value is still NULL.
 */
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  const char *const tail = str_ptr + str_length;
  const char *cursor = str_ptr;
  const char *rest = NULL;
  unsigned int n_token_bytes;
  grn_token_status token_status;

  while (cursor < tail) {
    size_t n_bytes = grn_charlen_(ctx, cursor, tail, encoding);

    if (n_bytes == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, cursor, n_bytes, encoding)) {
      /* Resume after the delimiter on the next call. */
      rest = cursor + n_bytes;
      break;
    }
    cursor += n_bytes;
  }

  n_token_bytes = cursor - str_ptr;
  if (cursor != tail) {
    token_status = GRN_TOKENIZER_CONTINUE;
  } else {
    token_status = GRN_TOKENIZER_LAST;
  }
  grn_tokenizer_token_push(ctx, token, str_ptr, n_token_bytes, token_status);

  return rest;
}
/*
 * Fills in the "normalized" side of `string` without performing any real
 * normalization.
 *
 * - string->normalized receives a freshly allocated, NUL-terminated copy of
 *   string->original.  When GRN_STRING_REMOVE_TOKENIZED_DELIMITER is set and
 *   ctx->encoding is GRN_ENC_UTF8, every character accepted by
 *   grn_tokenizer_is_tokenized_delimiter() is left out of the copy.
 * - When GRN_STRING_WITH_CHECKS is set, string->checks receives one int16_t
 *   per byte of the original text: the byte length of the character for
 *   the byte that starts it (decided from the lead byte according to
 *   string->encoding) and 0 for each following byte of that character.
 *
 * Returns `string` on success.  If either allocation fails, an error is
 * reported with GRN_NO_MEMORY_AVAILABLE, `string` is released with
 * grn_string_close() and NULL is returned, so the caller must not use
 * `string` afterwards.
 */
static grn_string *
grn_fake_string_open(grn_ctx *ctx, grn_string *string)
{
  /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
  grn_string *nstr = string;
  const char *str;
  unsigned int str_len;

  str = nstr->original;
  str_len = nstr->original_length_in_bytes;

  /* +1 for the terminating NUL written below. */
  if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[string][fake] failed to allocate normalized text space");
    grn_string_close(ctx, (grn_obj *)nstr);
    return NULL;
  }

  if ((nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) &&
      ctx->encoding == GRN_ENC_UTF8) {
    /* Copy character by character, skipping tokenized delimiters. */
    int char_length;
    const char *source_current = str;
    const char *source_end = str + str_len;
    char *destination = nstr->normalized;
    unsigned int destination_length = 0;

    while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
      if (!grn_tokenizer_is_tokenized_delimiter(ctx,
                                                source_current, char_length,
                                                ctx->encoding)) {
        memcpy(destination, source_current, char_length);
        destination += char_length;
        destination_length += char_length;
      }
      source_current += char_length;
    }
    nstr->normalized[destination_length] = '\0';
    nstr->normalized_length_in_bytes = destination_length;
  } else {
    memcpy(nstr->normalized, str, str_len);
    nstr->normalized[str_len] = '\0';
    nstr->normalized_length_in_bytes = str_len;
  }

  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
    int16_t remaining = 0; /* bytes still belonging to the current character */
    size_t i;

    if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
      grn_string_close(ctx, (grn_obj *)nstr);
      ERR(GRN_NO_MEMORY_AVAILABLE,
          "[string][fake] failed to allocate checks space");
      return NULL;
    }

    /*
     * NOTE(review): checks is sized and indexed by the ORIGINAL bytes, even
     * when tokenized delimiters were dropped from normalized above, so the
     * two may not line up in that case -- confirm whether callers combine
     * both flags.
     */
    for (i = 0; i < str_len; i++) {
      if (remaining == 0) {
        /* At a character boundary: derive its length from the lead byte. */
        const unsigned char lead = (unsigned char) str[i];

        switch (nstr->encoding) {
        case GRN_ENC_EUC_JP:
          if ((lead >= 0xa1U && lead <= 0xfeU) || lead == 0x8eU) {
            remaining = 2;
          } else if (lead == 0x8fU) {
            remaining = 3;
          } else {
            remaining = 1;
          }
          break;
        case GRN_ENC_SJIS:
          if (lead >= 0x81U &&
              (lead <= 0x9fU || (lead >= 0xe0U && lead <= 0xfcU))) {
            remaining = 2;
          } else {
            remaining = 1;
          }
          break;
        case GRN_ENC_UTF8:
          if (!(lead & 0x80U)) {
            remaining = 1;
          } else if (!(lead & 0x20U)) {
            remaining = 2;
          } else if (!(lead & 0x10U)) {
            remaining = 3;
          } else {
            remaining = 4;
          }
          break;
        default:
          /* Every byte is treated as its own character. */
          remaining = 1;
          break;
        }
        nstr->checks[i] = remaining;
      } else {
        nstr->checks[i] = 0;
      }
      remaining--;
    }
  }

  return nstr;
}