/*
 * `tokenize` command handler: tokenizes `string` with the tokenizer
 * named by `tokenizer`, using a temporary lexicon built from the
 * `normalizer` and `token_filters` arguments, and outputs the tokens.
 * Always returns NULL; results and errors are reported via `ctx`.
 */
static grn_obj *
command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args,
                 grn_user_data *user_data)
{
  grn_obj *tokenizer_name =
    grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
  grn_obj *string =
    grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  grn_obj *normalizer_name =
    grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
  grn_obj *flag_names =
    grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  grn_obj *mode_name =
    grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  grn_obj *token_filter_names =
    grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);
  unsigned int flags;
  grn_obj *lexicon;
  grn_obj tokens;

  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] tokenizer name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] string is missing");
    return NULL;
  }

  flags = parse_tokenize_flags(ctx, flag_names);
  if (ctx->rc != GRN_SUCCESS) {
    return NULL;
  }

  lexicon = create_lexicon_for_tokenize(ctx,
                                        tokenizer_name,
                                        normalizer_name,
                                        token_filter_names);
  if (!lexicon) {
    return NULL;
  }

  /* True when `mode` exactly equals the given C-string literal. */
#define MODE_IS(name)                                           \
  (GRN_TEXT_LEN(mode_name) == strlen(name) &&                   \
   memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
  if (GRN_TEXT_LEN(mode_name) == 0 || MODE_IS("ADD")) {
    /* Default mode: register tokens into the lexicon as we tokenize. */
    tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
    output_tokens(ctx, &tokens, lexicon, NULL);
  } else if (MODE_IS("GET")) {
    /* GET mode: fill the lexicon with an ADD pass first, then
       re-tokenize in GET mode so tokens resolve to existing entries. */
    tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
    GRN_BULK_REWIND(&tokens);
    tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
    output_tokens(ctx, &tokens, lexicon, NULL);
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] invalid mode: <%.*s>",
                     (int)GRN_TEXT_LEN(mode_name),
                     GRN_TEXT_VALUE(mode_name));
  }
  GRN_OBJ_FIN(ctx, &tokens);

#undef MODE_IS

  grn_obj_unlink(ctx, lexicon);

  return NULL;
}
/*
 * `tokenize` command handler (grn_raw_string based): tokenizes `string`
 * with the tokenizer named by `tokenizer`, using a temporary lexicon
 * built from the `normalizer` and `token_filters` arguments, and
 * outputs the tokens.  Always returns NULL; results and errors are
 * reported via `ctx`.
 *
 * Fix: removed a stray `#undef MODE_NAME_EQUAL` left over from the
 * macro-based implementation; this function never defines that macro
 * (mode comparison uses GRN_RAW_STRING_EQUAL_CSTRING).
 */
static grn_obj *
command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args,
                 grn_user_data *user_data)
{
  grn_raw_string tokenizer_raw;
  grn_raw_string string_raw;
  grn_raw_string normalizer_raw;
  grn_raw_string flags_raw;
  grn_raw_string mode_raw;
  grn_raw_string token_filters_raw;

  /* Fetch each command argument as a raw (value, length) pair. */
#define GET_VALUE(name)                                         \
  name ## _raw.value =                                          \
    grn_plugin_proc_get_var_string(ctx,                         \
                                   user_data,                   \
                                   #name,                       \
                                   strlen(#name),               \
                                   &(name ## _raw.length))

  GET_VALUE(tokenizer);
  GET_VALUE(string);
  GET_VALUE(normalizer);
  GET_VALUE(flags);
  GET_VALUE(mode);
  GET_VALUE(token_filters);

#undef GET_VALUE

  if (tokenizer_raw.length == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] tokenizer name is missing");
    return NULL;
  }

  if (string_raw.length == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;

    flags = parse_tokenize_flags(ctx, &flags_raw);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = create_lexicon_for_tokenize(ctx,
                                          &tokenizer_raw,
                                          &normalizer_raw,
                                          &token_filters_raw);
    if (!lexicon) {
      return NULL;
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (mode_raw.length == 0 ||
          GRN_RAW_STRING_EQUAL_CSTRING(mode_raw, "ADD")) {
        /* Default mode: register tokens into the lexicon as we
           tokenize. */
        tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else if (GRN_RAW_STRING_EQUAL_CSTRING(mode_raw, "GET")) {
        /* GET mode: fill the lexicon with an ADD pass first, then
           re-tokenize in GET mode so tokens resolve to existing
           entries. */
        tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_ADD, flags, &tokens);
        GRN_BULK_REWIND(&tokens);
        tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[tokenize] invalid mode: <%.*s>",
                         (int)mode_raw.length,
                         mode_raw.value);
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }

    grn_obj_unlink(ctx, lexicon);
  }

  return NULL;
}
/*
 * `table_tokenize` command handler: tokenizes `string` with the
 * tokenizer of the existing lexicon table named by `table`, optionally
 * reporting index information through `index_column`, and outputs the
 * tokens.  Always returns NULL; results and errors are reported via
 * `ctx`.
 *
 * Fix: a nonexistent `table` previously failed silently (bare
 * `return NULL` with no error); it now reports the failure the same
 * way the other argument errors do.
 */
static grn_obj *
command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args,
                       grn_user_data *user_data)
{
  grn_obj *table_name;
  grn_obj *string;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *index_column_name;

  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  index_column_name =
    grn_plugin_proc_get_var(ctx, user_data, "index_column", -1);

  if (GRN_TEXT_LEN(table_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[table_tokenize] table name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[table_tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;
    grn_obj *index_column = NULL;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = grn_ctx_get(ctx,
                          GRN_TEXT_VALUE(table_name),
                          GRN_TEXT_LEN(table_name));
    if (!lexicon) {
      /* Report the missing table instead of failing silently. */
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[table_tokenize] nonexistent lexicon: <%.*s>",
                       (int)GRN_TEXT_LEN(table_name),
                       GRN_TEXT_VALUE(table_name));
      return NULL;
    }

    /* True when `mode` exactly equals the given C-string literal. */
#define MODE_NAME_EQUAL(name)\
  (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
   memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    if (GRN_TEXT_LEN(index_column_name) > 0) {
      index_column = grn_obj_column(ctx, lexicon,
                                    GRN_TEXT_VALUE(index_column_name),
                                    GRN_TEXT_LEN(index_column_name));
      if (!index_column) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] nonexistent index column: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
      if (index_column->header.type != GRN_COLUMN_INDEX) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
        /* Default mode: look up tokens without adding new entries. */
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else if (MODE_NAME_EQUAL("ADD")) {
        /* ADD mode: register new tokens into the lexicon. */
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name),
                         GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

exit:
    grn_obj_unlink(ctx, lexicon);
    if (index_column) {
      grn_obj_unlink(ctx, index_column);
    }
  }

  return NULL;
}
/*
 * `table_tokenize` command handler (grn_raw_string based): tokenizes
 * `string` with the tokenizer of the existing lexicon table named by
 * `table`, optionally reporting index information through
 * `index_column`, and outputs the tokens.  Always returns NULL;
 * results and errors are reported via `ctx`.
 *
 * Fix: removed a stray `#undef MODE_NAME_EQUAL` left over from the
 * macro-based implementation; this function never defines that macro
 * (mode comparison uses GRN_RAW_STRING_EQUAL_CSTRING).
 */
static grn_obj *
command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args,
                       grn_user_data *user_data)
{
  grn_raw_string table_raw;
  grn_raw_string string_raw;
  grn_raw_string flags_raw;
  grn_raw_string mode_raw;
  grn_raw_string index_column_raw;

  /* Fetch each command argument as a raw (value, length) pair. */
#define GET_VALUE(name)                                         \
  name ## _raw.value =                                          \
    grn_plugin_proc_get_var_string(ctx,                         \
                                   user_data,                   \
                                   #name,                       \
                                   strlen(#name),               \
                                   &(name ## _raw.length))

  GET_VALUE(table);
  GET_VALUE(string);
  GET_VALUE(flags);
  GET_VALUE(mode);
  GET_VALUE(index_column);

#undef GET_VALUE

  if (table_raw.length == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[table_tokenize] table name is missing");
    return NULL;
  }

  if (string_raw.length == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[table_tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;
    grn_obj *index_column = NULL;

    flags = parse_tokenize_flags(ctx, &flags_raw);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = grn_ctx_get(ctx, table_raw.value, table_raw.length);
    if (!lexicon) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[table_tokenize] nonexistent lexicon: <%.*s>",
                       (int)table_raw.length,
                       table_raw.value);
      return NULL;
    }

    if (index_column_raw.length > 0) {
      index_column = grn_obj_column(ctx, lexicon,
                                    index_column_raw.value,
                                    index_column_raw.length);
      if (!index_column) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] nonexistent index column: <%.*s>",
                         (int)index_column_raw.length,
                         index_column_raw.value);
        goto exit;
      }
      if (index_column->header.type != GRN_COLUMN_INDEX) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] "
                         "index column must be COLUMN_INDEX: <%.*s>",
                         (int)index_column_raw.length,
                         index_column_raw.value);
        goto exit;
      }
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (mode_raw.length == 0 ||
          GRN_RAW_STRING_EQUAL_CSTRING(mode_raw, "GET")) {
        /* Default mode: look up tokens without adding new entries. */
        tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else if (GRN_RAW_STRING_EQUAL_CSTRING(mode_raw, "ADD")) {
        /* ADD mode: register new tokens into the lexicon. */
        tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] invalid mode: <%.*s>",
                         (int)mode_raw.length,
                         mode_raw.value);
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }

exit:
    grn_obj_unlink(ctx, lexicon);
    if (index_column) {
      grn_obj_unlink(ctx, index_column);
    }
  }

  return NULL;
}