/*
 * Produces the next token from the tokenizer buffer.
 *
 * Scans from token->next towards token->end one character at a time
 * (character width comes from grn_charlen_()).  The scan stops at:
 *   - a character for which grn_charlen_() returns 0: token->next is moved
 *     to token->end so the following call starts at the end of the buffer;
 *   - a whitespace character: the whole run of whitespace is skipped and
 *     token->next is set to the first byte after that run;
 *   - token->end.
 *
 * The bytes [start, stop) are published by reference in token->curr_, and
 * token->stat_ is set to GRN_TOKEN_LAST only when the scan reached
 * token->end exactly (0 otherwise).  Both objects are pushed onto ctx.
 *
 * `table` is not used.  Always returns GRN_SUCCESS.
 */
static grn_rc
mecab_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  size_t cl;
  grn_mecab_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;

  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      token->next = (unsigned char *)e;
      break;
    }
    if (grn_isspace((const char *)r, token->encoding)) {
      const unsigned char *q = r;
      /* Stay inside [p, e): without the bound a whitespace run that reaches
       * the end of the buffer would make grn_isspace() read past token->end. */
      while (q < e && (cl = grn_isspace((const char *)q, token->encoding))) {
        q += cl;
      }
      token->next = (unsigned char *)q;
      break;
    }
  }

  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
/*
 * This function returns tokens one by one.
 *
 * Scans from token->next towards token->end one character at a time
 * (character width comes from grn_charlen_()).  The scan stops at:
 *   - a character for which grn_charlen_() returns 0: token->next is moved
 *     to token->end;
 *   - a whitespace character: the whole run of whitespace is skipped and
 *     token->next is set to the first byte after that run;
 *   - token->end.
 *
 * The bytes [start, stop) are published by reference in token->curr_, and
 * token->stat_ is set to GRN_TOKEN_LAST only when the scan reached
 * token->end exactly (0 otherwise).  Both objects are pushed onto ctx.
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *token = user_data->ptr;
  char *p = token->next, *r;
  char *e = token->end;

  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, r, e, token->encoding))) {
      token->next = e;
      break;
    }
    if (grn_isspace(r, token->encoding)) {
      char *q = r;
      /* Stay inside [p, e): without the bound a whitespace run that reaches
       * the end of the buffer would make grn_isspace() read past token->end. */
      while (q < e && (cl = grn_isspace(q, token->encoding))) {
        q += cl;
      }
      token->next = q;
      break;
    }
  }

  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
/*
 * This function returns tokens one by one.
 *
 * When the query carries tokenized delimiters, the work is delegated to
 * grn_tokenizer_tokenized_delimiter_next() and its result becomes the new
 * tokenizer->next.
 *
 * Otherwise the buffer [tokenizer->next, tokenizer->end) is scanned:
 * whitespace found at the very start of the token is dropped, the token
 * runs up to the next whitespace (the whole whitespace run is consumed),
 * up to a character for which grn_charlen_() returns 0, or up to the end
 * of the buffer.  The token is pushed with GRN_TOKENIZER_LAST when the scan
 * or tokenizer->next reached the end, GRN_TOKENIZER_CONTINUE otherwise.
 *
 * nargs and args are not used.  Always returns NULL.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;
  const char *token_head;
  const char *cursor;
  const char *buffer_end;
  grn_tokenizer_status status;

  if (tokenizer->query->have_tokenized_delimiter) {
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
    return NULL;
  }

  token_head = tokenizer->next;
  buffer_end = tokenizer->end;
  cursor = token_head;

  while (cursor < buffer_end) {
    size_t char_bytes;
    int space_bytes = grn_isspace(cursor, encoding);

    if (space_bytes > 0 && cursor == token_head) {
      /* Leading whitespace: move the token start past it. */
      cursor += (size_t)space_bytes;
      token_head = cursor;
      continue;
    }

    char_bytes = grn_charlen_(ctx, cursor, buffer_end, encoding);
    if (char_bytes == 0) {
      tokenizer->next = buffer_end;
      break;
    }

    if (space_bytes > 0) {
      /* The token ends here; swallow the whole whitespace run. */
      const char *after_spaces = cursor + space_bytes;
      while (after_spaces < buffer_end &&
             (space_bytes = grn_isspace(after_spaces, encoding)) != 0) {
        after_spaces += space_bytes;
      }
      tokenizer->next = after_spaces;
      break;
    }

    cursor += char_bytes;
  }

  status = GRN_TOKENIZER_CONTINUE;
  if (cursor == buffer_end || tokenizer->next == buffer_end) {
    status = GRN_TOKENIZER_LAST;
  }
  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           token_head, cursor - token_head, status);
  return NULL;
}
/*
 * Reads one word starting at q->cur and returns it via token_new().
 *
 * The word ends at the first of:
 *   - a position where grn_charlen() returns 0: q->cur jumps to q->str_end;
 *   - a whitespace character or GRN_QUERY_PARENR: q->cur stays on it;
 *   - GRN_QUERY_PREFIX: *prefixp is set to 1 and q->cur moves past it.
 * The terminating character itself is never part of the word.
 */
inline static grn_cell *
get_word(grn_ctx *ctx, grn_query *q, int *prefixp)
{
  char *const word_head = q->cur;
  char *scan = word_head;

  while (1) {
    /* null check and length check */
    const unsigned int char_bytes = grn_charlen(ctx, scan, q->str_end);

    if (char_bytes == 0) {
      q->cur = q->str_end;
      break;
    }
    if (grn_isspace(scan, q->encoding) || *scan == GRN_QUERY_PARENR) {
      q->cur = scan;
      break;
    }
    if (*scan == GRN_QUERY_PREFIX) {
      *prefixp = 1;
      q->cur = scan + 1;
      break;
    }
    scan += char_bytes;
  }

  return token_new(q, word_head, scan);
}
/*
 * Advances q->cur over consecutive whitespace characters.
 *
 * Stops at the first non-whitespace character or at q->str_end.  If
 * grn_charlen() returns 0 for a whitespace position, q->cur is moved
 * straight to q->str_end.
 */
inline static void
skip_space(grn_ctx *ctx, grn_query *q)
{
  for (;;) {
    unsigned int char_bytes;

    if (q->cur >= q->str_end) {
      return;
    }
    if (!grn_isspace(q->cur, q->encoding)) {
      return;
    }

    /* null check and length check */
    char_bytes = grn_charlen(ctx, q->cur, q->str_end);
    if (char_bytes == 0) {
      q->cur = q->str_end;
      return;
    }
    q->cur += char_bytes;
  }
}
/*
 * Parses the pragma section at q->cur.
 *
 * Each pragma is introduced by GRN_QUERY_PREFIX followed by a letter:
 *   'E'  numeric escalation threshold, optionally followed by ',' and a
 *        numeric escalation decay step;
 *   'D'  default operator, selected by the first character after 'D'
 *        ('O', GRN_QUERY_AND, GRN_QUERY_BUT or GRN_QUERY_ADJ_INC);
 *   'W'  weight vector, parsed by get_weight_vector().
 * After a recognized pragma q->cur is moved past it.  An unrecognized letter
 * leaves q->cur untouched; the loop then ends unless that character is
 * itself GRN_QUERY_PREFIX, in which case it starts the next pragma.
 */
inline static void
get_pragma(grn_ctx *ctx, grn_query *q)
{
  char *start, *end = q->cur;

  while (end < q->str_end && *end == GRN_QUERY_PREFIX) {
    if (++end >= q->str_end) { break; }
    switch (*end) {
    case 'E' :
      start = ++end;
      q->escalation_threshold = grn_atoi(start, q->str_end, (const char **)&end);
      /* isdigit() is only defined for unsigned char values (or EOF), so the
       * possibly negative plain char must be converted first. */
      while (end < q->str_end &&
             (isdigit((unsigned char)*end) || *end == '-')) {
        end++;
      }
      /* Check the bound before looking at *end: the number may run up to
       * the very end of the query string. */
      if (end < q->str_end && *end == ',') {
        start = ++end;
        q->escalation_decaystep = grn_atoi(start, q->str_end, (const char **)&end);
      }
      q->cur = end;
      break;
    case 'D' :
      start = ++end;
      while (end < q->str_end && *end != GRN_QUERY_PREFIX &&
             !grn_isspace(end, q->encoding)) {
        end++;
      }
      if (end > start) {
        switch (*start) {
        case 'O' :
          q->default_op = GRN_OP_OR;
          break;
        case GRN_QUERY_AND :
          q->default_op = GRN_OP_AND;
          break;
        case GRN_QUERY_BUT :
          q->default_op = GRN_OP_BUT;
          break;
        case GRN_QUERY_ADJ_INC :
          q->default_op = GRN_OP_ADJUST;
          break;
        default :
          /* Unknown operator character: keep the current default. */
          break;
        }
      }
      q->cur = end;
      break;
    case 'W' :
      start = ++end;
      end = (char *)get_weight_vector(ctx, q, start);
      q->cur = end;
      break;
    default :
      /* Unknown pragma letter: nothing is consumed. */
      break;
    }
  }
}
static void json_read(grn_ctx *ctx, grn_loader *loader, const char *str, unsigned int str_len) { const char *const beg = str; char c; int len; const char *se = str + str_len; while (str < se) { c = *str; switch (loader->stat) { case GRN_LOADER_BEGIN : if ((len = grn_isspace(str, ctx->encoding))) { str += len; continue; } switch (c) { case '[' : JSON_READ_OPEN_BRACKET(); break; case '{' : JSON_READ_OPEN_BRACE(); break; default : ERR(GRN_INVALID_ARGUMENT, "JSON must start with '[' or '{': <%.*s>", str_len, beg); loader->stat = GRN_LOADER_END; break; } break; case GRN_LOADER_TOKEN : if ((len = grn_isspace(str, ctx->encoding))) { str += len; continue; } switch (c) { case '"' : loader->stat = GRN_LOADER_STRING; values_add(ctx, loader); str++; break; case '[' : JSON_READ_OPEN_BRACKET(); break; case '{' : JSON_READ_OPEN_BRACE(); break; case ':' : str++; break; case ',' : str++; break; case ']' : bracket_close(ctx, loader); loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END; if (ctx->rc == GRN_CANCEL) { loader->stat = GRN_LOADER_END; } str++; break; case '}' : brace_close(ctx, loader); loader->stat = GRN_BULK_VSIZE(&loader->level) ? 
GRN_LOADER_TOKEN : GRN_LOADER_END; if (ctx->rc == GRN_CANCEL) { loader->stat = GRN_LOADER_END; } str++; break; case '+' : case '-' : case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : case '8' : case '9' : loader->stat = GRN_LOADER_NUMBER; values_add(ctx, loader); break; default : if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('_' == c)) { loader->stat = GRN_LOADER_SYMBOL; values_add(ctx, loader); } else { if ((len = grn_charlen(ctx, str, se))) { GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char('%c') at", c); GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg) + len, beg); GRN_LOG(ctx, GRN_LOG_ERROR, "%*s", (int)(str - beg) + 1, "^"); str += len; } else { GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char(\\x%.2x) after", c); GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg), beg); str = se; } } break; } break; case GRN_LOADER_SYMBOL : if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || ('_' == c)) { GRN_TEXT_PUTC(ctx, loader->last, c); str++; } else { char *v = GRN_TEXT_VALUE(loader->last); switch (*v) { case 'n' : if (GRN_TEXT_LEN(loader->last) == 4 && !memcmp(v, "null", 4)) { loader->last->header.domain = GRN_DB_VOID; GRN_BULK_REWIND(loader->last); } break; case 't' : if (GRN_TEXT_LEN(loader->last) == 4 && !memcmp(v, "true", 4)) { loader->last->header.domain = GRN_DB_BOOL; GRN_BOOL_SET(ctx, loader->last, GRN_TRUE); } break; case 'f' : if (GRN_TEXT_LEN(loader->last) == 5 && !memcmp(v, "false", 5)) { loader->last->header.domain = GRN_DB_BOOL; GRN_BOOL_SET(ctx, loader->last, GRN_FALSE); } break; default : break; } loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END; } break; case GRN_LOADER_NUMBER : switch (c) { case '+' : case '-' : case '.' 
: case 'e' : case 'E' : case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : case '8' : case '9' : GRN_TEXT_PUTC(ctx, loader->last, c); str++; break; default : { const char *cur, *str = GRN_BULK_HEAD(loader->last); const char *str_end = GRN_BULK_CURR(loader->last); int64_t i = grn_atoll(str, str_end, &cur); if (cur == str_end) { loader->last->header.domain = GRN_DB_INT64; GRN_INT64_SET(ctx, loader->last, i); } else if (cur != str) { uint64_t i = grn_atoull(str, str_end, &cur); if (cur == str_end) { loader->last->header.domain = GRN_DB_UINT64; GRN_UINT64_SET(ctx, loader->last, i); } else if (cur != str) { double d; char *end; grn_obj buf; GRN_TEXT_INIT(&buf, 0); GRN_TEXT_PUT(ctx, &buf, str, GRN_BULK_VSIZE(loader->last)); GRN_TEXT_PUTC(ctx, &buf, '\0'); errno = 0; d = strtod(GRN_TEXT_VALUE(&buf), &end); if (!errno && end + 1 == GRN_BULK_CURR(&buf)) { loader->last->header.domain = GRN_DB_FLOAT; GRN_FLOAT_SET(ctx, loader->last, d); } GRN_OBJ_FIN(ctx, &buf); } } } loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END; break; } break; case GRN_LOADER_STRING : switch (c) { case '\\' : loader->stat = GRN_LOADER_STRING_ESC; str++; break; case '"' : str++; loader->stat = GRN_BULK_VSIZE(&loader->level) ? 
GRN_LOADER_TOKEN : GRN_LOADER_END; /* *(GRN_BULK_CURR(loader->last)) = '\0'; GRN_LOG(ctx, GRN_LOG_ALERT, "read str(%s)", GRN_TEXT_VALUE(loader->last)); */ break; default : if ((len = grn_charlen(ctx, str, se))) { GRN_TEXT_PUT(ctx, loader->last, str, len); str += len; } else { GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char(\\x%.2x) after", c); GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg), beg); str = se; } break; } break; case GRN_LOADER_STRING_ESC : switch (c) { case 'b' : GRN_TEXT_PUTC(ctx, loader->last, '\b'); loader->stat = GRN_LOADER_STRING; break; case 'f' : GRN_TEXT_PUTC(ctx, loader->last, '\f'); loader->stat = GRN_LOADER_STRING; break; case 'n' : GRN_TEXT_PUTC(ctx, loader->last, '\n'); loader->stat = GRN_LOADER_STRING; break; case 'r' : GRN_TEXT_PUTC(ctx, loader->last, '\r'); loader->stat = GRN_LOADER_STRING; break; case 't' : GRN_TEXT_PUTC(ctx, loader->last, '\t'); loader->stat = GRN_LOADER_STRING; break; case 'u' : loader->stat = GRN_LOADER_UNICODE0; break; default : GRN_TEXT_PUTC(ctx, loader->last, c); loader->stat = GRN_LOADER_STRING; break; } str++; break; case GRN_LOADER_UNICODE0 : switch (c) { case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : case '8' : case '9' : loader->unichar = (c - '0') * 0x1000; break; case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' : loader->unichar = (c - 'a' + 10) * 0x1000; break; case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' : loader->unichar = (c - 'A' + 10) * 0x1000; break; default : ;// todo : error } loader->stat = GRN_LOADER_UNICODE1; str++; break; case GRN_LOADER_UNICODE1 : switch (c) { case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : case '8' : case '9' : loader->unichar += (c - '0') * 0x100; break; case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' : loader->unichar += (c - 'a' + 10) * 0x100; break; case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' : loader->unichar += 
(c - 'A' + 10) * 0x100; break; default : ;// todo : error } loader->stat = GRN_LOADER_UNICODE2; str++; break; case GRN_LOADER_UNICODE2 : switch (c) { case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : case '8' : case '9' : loader->unichar += (c - '0') * 0x10; break; case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' : loader->unichar += (c - 'a' + 10) * 0x10; break; case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' : loader->unichar += (c - 'A' + 10) * 0x10; break; default : ;// todo : error } loader->stat = GRN_LOADER_UNICODE3; str++; break; case GRN_LOADER_UNICODE3 : switch (c) { case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : case '8' : case '9' : loader->unichar += (c - '0'); break; case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' : loader->unichar += (c - 'a' + 10); break; case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' : loader->unichar += (c - 'A' + 10); break; default : ;// todo : error } { uint32_t u = loader->unichar; if (u >= 0xd800 && u <= 0xdbff) { /* High-surrogate code points */ loader->unichar_hi = u; loader->stat = GRN_LOADER_STRING; str++; break; } if (u >= 0xdc00 && u <= 0xdfff) { /* Low-surrogate code points */ u = 0x10000 + (loader->unichar_hi - 0xd800) * 0x400 + u - 0xdc00; } if (u < 0x80) { GRN_TEXT_PUTC(ctx, loader->last, u); } else { if (u < 0x800) { GRN_TEXT_PUTC(ctx, loader->last, (u >> 6) | 0xc0); } else { if (u < 0x10000) { GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0); } else { GRN_TEXT_PUTC(ctx, loader->last, (u >> 18) | 0xf0); GRN_TEXT_PUTC(ctx, loader->last, ((u >> 12) & 0x3f) | 0x80); } GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x3f) | 0x80); } GRN_TEXT_PUTC(ctx, loader->last, (u & 0x3f) | 0x80); }
/*
 * Feeds `string` (string_bytes bytes long) to
 * chunked_tokenize_utf8_chunk() in pieces.
 *
 * Inputs shorter than grn_mecab_chunk_size_threshold are passed through in
 * one call.  Longer inputs are split as follows:
 *   - whitespace always closes the pending chunk and is itself dropped;
 *   - once the pending chunk reaches the threshold it is flushed, cut just
 *     after the most recent delimiter character (as reported by
 *     is_delimiter_character()) when one was seen in this chunk, or at the
 *     current position otherwise;
 *   - whatever remains at the end is flushed last.
 *
 * Returns GRN_FALSE as soon as a chunk fails or an invalid byte sequence is
 * found (an error is reported through GRN_PLUGIN_ERROR in the latter case).
 * When nothing is left to flush at the end, returns GRN_TRUE; otherwise
 * returns the result of the final chunk call.
 */
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *pending_head;
  const char *cursor;
  const char *split_point;
  const char *text_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx, tokenizer, string, string_bytes);
  }

  pending_head = string;
  cursor = string;
  split_point = NULL;

  while (cursor < text_end) {
    int space_bytes;
    int character_bytes;
    const char *character_head;

    space_bytes = grn_isspace(cursor, encoding);
    if (space_bytes > 0) {
      /* Whitespace ends the pending chunk and is not forwarded. */
      if (pending_head != cursor &&
          !chunked_tokenize_utf8_chunk(ctx, tokenizer,
                                       pending_head, cursor - pending_head)) {
        return GRN_FALSE;
      }
      cursor += space_bytes;
      pending_head = cursor;
      split_point = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, cursor, text_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(cursor - string));
      return GRN_FALSE;
    }

    character_head = cursor;
    cursor += character_bytes;
    if (is_delimiter_character(ctx, character_head, character_bytes)) {
      /* Remember the position just after the delimiter. */
      split_point = cursor;
    }

    if ((cursor - pending_head) >= grn_mecab_chunk_size_threshold) {
      const char *flush_end = split_point ? split_point : cursor;

      if (!chunked_tokenize_utf8_chunk(ctx, tokenizer,
                                       pending_head, flush_end - pending_head)) {
        return GRN_FALSE;
      }
      pending_head = flush_end;
      split_point = NULL;
    }
  }

  if (cursor != pending_head) {
    return chunked_tokenize_utf8_chunk(ctx, tokenizer,
                                       pending_head, cursor - pending_head);
  }
  return GRN_TRUE;
}