/*
 * Common initializer for delimiter-based tokenizers.
 *
 * Pops the target string from `ctx`, allocates a grn_delimited_tokenizer,
 * records `delimiter`/`delimiter_len`, and opens the string with
 * grn_str_open_() using the encoding reported by grn_table_get_info() for
 * `table`.  GRN_OBJ_KEY_NORMALIZE is passed through to grn_str_open_() when
 * the table has it set.
 *
 * On success the tokenizer is stored in `user_data->ptr`.  On any failure
 * `user_data->ptr` is left untouched, so it never refers to a token that has
 * already been freed.
 *
 * Always returns NULL.  A missing argument and a grn_str_open_() failure are
 * reported through ERR(); an allocation failure just returns.
 */
static grn_obj *
delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
               uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_obj *str;
  int nflags = 0;
  grn_delimited_tokenizer *token;
  grn_obj_flags table_flags;

  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    return NULL;
  }

  token->delimiter = delimiter;
  token->delimiter_len = delimiter_len;
  token->pos = 0;

  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }

  /* Scan window over the string produced by grn_str_open_(). */
  token->next = (unsigned char *)token->nstr->norm;
  token->end = token->next + token->nstr->norm_blen;
  token->len = token->nstr->length;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);

  /* Publish only a fully initialized tokenizer. */
  user_data->ptr = token;
  return NULL;
}
/*
 * Common initializer for the N-gram tokenizers (grn_rc-returning variant).
 *
 * Pops the target string from `ctx`, allocates a grn_ngram_tokenizer with
 * `ngram_unit` characters per gram and alphabet/digit/symbol grouping
 * enabled, and opens the string with grn_str_open_() using
 * GRN_STR_REMOVEBLANK | GRN_STR_WITH_CTYPES plus the table's
 * GRN_OBJ_KEY_NORMALIZE flag.
 *
 * Returns:
 *   GRN_SUCCESS            - tokenizer stored in `user_data->ptr`.
 *   GRN_INVALID_ARGUMENT   - nothing could be popped from `ctx`.
 *   ctx->rc                - the tokenizer could not be allocated.
 *   GRN_TOKENIZER_ERROR    - grn_str_open_() failed.
 *
 * On every failure path the tokenizer is released and `user_data->ptr` is
 * left untouched, so no half-initialized token is leaked or published.
 */
static grn_rc
ngram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
           uint8_t ngram_unit)
{
  grn_obj *str;
  int nflags = GRN_STR_REMOVEBLANK|GRN_STR_WITH_CTYPES;
  grn_ngram_tokenizer *token;
  grn_obj_flags table_flags;

  if (!(str = grn_ctx_pop(ctx))) {
    return GRN_INVALID_ARGUMENT;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    return ctx->rc;
  }

  token->uni_alpha = 1;
  token->uni_digit = 1;
  token->uni_symbol = 1;
  token->ngram_unit = ngram_unit;
  token->overlap = 0;
  token->pos = 0;
  token->skip = 0;

  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open failed at grn_token_open");
    /* The token was previously leaked here. */
    GRN_FREE(token);
    return GRN_TOKENIZER_ERROR;
  }

  /* Scan window and per-character types from grn_str_open_(). */
  token->next = (unsigned char *)token->nstr->norm;
  token->end = token->next + token->nstr->norm_blen;
  token->ctypes = token->nstr->ctypes;
  token->len = token->nstr->length;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);

  /* Publish only a fully initialized tokenizer. */
  user_data->ptr = token;
  return GRN_SUCCESS;
}
/*
 * Common initializer for the N-gram tokenizers.
 *
 * Pops the target string from `ctx`, allocates a grn_ngram_tokenizer
 * configured from the arguments, and opens the string with grn_str_open_()
 * using GRN_STR_REMOVEBLANK | GRN_STR_WITH_CTYPES plus the table's
 * GRN_OBJ_KEY_NORMALIZE flag.
 *
 *   ngram_unit   - stored in token->ngram_unit.
 *   uni_alpha    - stored in token->uni_alpha.
 *   uni_digit    - stored in token->uni_digit.
 *   uni_symbol   - stored in token->uni_symbol.
 *   ignore_blank - stored in token->ignore_blank.
 *
 * On success the tokenizer is stored in `user_data->ptr`.  On any failure
 * `user_data->ptr` is left untouched, so it never refers to a token that has
 * already been freed.
 *
 * Always returns NULL.  A missing argument and a grn_str_open_() failure are
 * reported through ERR(); an allocation failure just returns.
 */
static grn_obj *
ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
           uint8_t ngram_unit, uint8_t uni_alpha, uint8_t uni_digit,
           uint8_t uni_symbol, uint8_t ignore_blank)
{
  grn_obj *str;
  int nflags = GRN_STR_REMOVEBLANK|GRN_STR_WITH_CTYPES;
  grn_ngram_tokenizer *token;
  grn_obj_flags table_flags;

  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    return NULL;
  }

  token->uni_alpha = uni_alpha;
  token->uni_digit = uni_digit;
  token->uni_symbol = uni_symbol;
  token->ngram_unit = ngram_unit;
  token->ignore_blank = ignore_blank;
  token->overlap = 0;
  token->pos = 0;
  token->skip = 0;

  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }

  /* Scan window and per-character types from grn_str_open_(). */
  token->next = (unsigned char *)token->nstr->norm;
  token->end = token->next + token->nstr->norm_blen;
  token->ctypes = token->nstr->ctypes;
  token->len = token->nstr->length;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);

  /* Publish only a fully initialized tokenizer. */
  user_data->ptr = token;
  return NULL;
}
/*
 * Initializer for the MeCab tokenizer (grn_rc-returning variant).
 *
 * Pops the target string from `ctx`, opens it with grn_str_open_(), and
 * runs it through the shared MeCab instance with mecab_sparse_tostr3().
 * The output buffer starts at twice the input length and is doubled, up to
 * ten attempts, for as long as MeCab reports "output buffer overflow".
 * Trailing whitespace in the MeCab output is stripped before the scan
 * window (`next`/`end`) is set.
 *
 * Returns:
 *   GRN_SUCCESS            - tokenizer stored in `user_data->ptr`.
 *   GRN_INVALID_ARGUMENT   - nothing could be popped from `ctx`.
 *   ctx->rc                - an allocation failed.
 *   GRN_TOKENIZER_ERROR    - MeCab is unavailable, grn_str_open_() failed,
 *                            or mecab_sparse_tostr3() failed.
 *
 * Every failure path releases what was acquired so far (the string opened
 * by grn_str_open_() and the tokenizer) and leaves `user_data->ptr`
 * untouched.
 */
static grn_rc
mecab_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf = NULL, *s = NULL, *p;
  char mecab_err[256];
  grn_obj_flags table_flags;
  grn_mecab_tokenizer *token;
  unsigned int bufsize, maxtrial = 10, len;

  if (!(str = grn_ctx_pop(ctx))) {
    return GRN_INVALID_ARGUMENT;
  }
  SOLE_MECAB_CONFIRM;
  if (!sole_mecab) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "mecab_new failed on grn_mecab_init");
    return GRN_TOKENIZER_ERROR;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) {
    return ctx->rc;
  }
  token->mecab = sole_mecab;

  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open failed at grn_token_open");
    /* The token was previously leaked here. */
    GRN_FREE(token);
    return GRN_TOKENIZER_ERROR;
  }

  len = token->nstr->norm_blen;
  /* strncpy() below copies at most sizeof - 1 bytes, so this stays intact. */
  mecab_err[sizeof(mecab_err) - 1] = '\0';
  for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
    if (!(buf = GRN_MALLOC(bufsize + 1))) {
      /* Capture the code before cleanup can touch ctx. */
      grn_rc rc = ctx->rc;
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
      grn_str_close(ctx, token->nstr);
      GRN_FREE(token);
      return rc;
    }
    MUTEX_LOCK(sole_mecab_lock);
    s = mecab_sparse_tostr3(token->mecab, token->nstr->norm, len, buf, bufsize);
    if (!s) {
      /* Copy the message while still holding the lock on the shared mecab. */
      strncpy(mecab_err, mecab_strerror(token->mecab), sizeof(mecab_err) - 1);
    }
    MUTEX_UNLOCK(sole_mecab_lock);
    if (s) { break; }
    GRN_FREE(buf);
    /* Only a too-small buffer is worth retrying with a larger one. */
    if (strstr(mecab_err, "output buffer overflow") == NULL) { break; }
  }
  if (!s) {
    GRN_LOG(ctx, GRN_LOG_ALERT, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
            len, bufsize, mecab_err);
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return GRN_TOKENIZER_ERROR;
  }

  /*
   * Certain versions of mecab return a trailing LF or spaces.  Walk back
   * from the terminator so that no pointer before `buf` is ever formed,
   * even when the output is empty.
   */
  p = buf + strlen(buf);
  while (buf < p && isspace(*(unsigned char *)(p - 1))) {
    *--p = '\0';
  }

  token->buf = (unsigned char *)buf;
  token->next = (unsigned char *)buf;
  token->end = (unsigned char *)p;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);

  /* Publish only a fully initialized tokenizer. */
  user_data->ptr = token;
  return GRN_SUCCESS;
}
/*
 * Initializer for the MeCab tokenizer.
 *
 * This function is called for a full text search query or a document to be
 * indexed, so both short and long strings are given.
 *
 * The shared MeCab instance is created on first use with
 * mecab_new2("-Owakati") under sole_mecab_lock.  The string popped from
 * `ctx` is opened with grn_str_open_(), passed to mecab_sparse_tostr2(), and
 * the result is copied into a private buffer while the lock is still held.
 * Trailing whitespace is stripped from the copy.
 *
 * The table encoding (from grn_table_get_info() on args[0]) must equal the
 * encoding detected for the MeCab dictionary; otherwise initialization
 * fails.
 *
 * The return value of this function is ignored (it is always NULL).  When
 * an error occurs in this function, `ctx->rc' is overwritten with an error
 * code (not GRN_SUCCESS).  `user_data->ptr` is set only on success.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf = NULL, *p;
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize = 0, len;

  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  /* Create the shared MeCab instance on first use. */
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR, "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }

  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }

  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) {
    return NULL;
  }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }

  len = token->nstr->norm_blen;
  /*
   * The result of mecab_sparse_tostr2() is copied before the lock is
   * released, so the copy cannot be affected by another user of the shared
   * instance.
   */
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }

  /*
   * A certain version of mecab returns trailing lf or spaces.  Start at the
   * terminator and walk back, so that no pointer before `buf` is formed
   * when the output is empty or all whitespace.
   */
  p = buf + bufsize - 1;
  while (buf < p && isspace(*(unsigned char *)(p - 1))) {
    *--p = '\0';
  }

  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = p;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}