/*
 * call-seq:
 *   patricia_trie.scan(string) -> Array
 *   patricia_trie.scan(string) {|record, word, start, length| ... }
 *
 * Scans _string_ and passes information about each substring that
 * matches a key stored in _patricia_trie_ to the block. When more
 * than one key matches at the same position, the longest match is
 * preferred.
 *
 * [_record_]
 *   The Groonga::Record of the matched key.
 *
 * [_word_]
 *   The matched substring.
 *
 * [_start_]
 *   The byte offset of _word_ within _string_.
 *
 * [_length_]
 *   The byte length of _word_.
 *
 * Without a block, the match information is collected and returned
 * as an Array (with a block, nil is returned).
 *
 *   words = Groonga::PatriciaTrie.create(:key_type => "ShortText",
 *                                        :key_normalize => true)
 *   words.add("リンク")
 *   adventure_of_link = words.add('リンクの冒険')
 *   words.add('冒険')
 *   gaxtu = words.add('ガッ')
 *   muteki = words.add('MUTEKI')
 *
 *   text = 'muTEki リンクの冒険 ミリバール ガッ'
 *   words.scan(text).each do |record, word, start, length|
 *     p [record.key, word, start, length]
 *       # -> ["MUTEKI", "muTEki", 0, 6]
 *       # -> ["リンクの冒険", "リンクの冒険", 7, 18]
 *       # -> ["ガッ", "ガッ", 42, 6]
 *   end
 *
 *   words.scan(text)
 *     # -> [[muteki, "muTEki", 0, 6],
 *     #     [adventure_of_link, "リンクの冒険", 7, 18],
 *     #     [gaxtu, "ガッ", 42, 6]]
 */
static VALUE
rb_grn_patricia_trie_scan (VALUE self, VALUE rb_string)
{
    grn_ctx *context;
    grn_obj *table;
    VALUE rb_result = Qnil;      /* stays Qnil when a block is given */
    grn_pat_scan_hit hits[1024]; /* per-chunk hit buffer for grn_pat_scan */
    const char *string;
    long string_length;
    grn_bool block_given;

    string = StringValuePtr(rb_string);
    string_length = RSTRING_LEN(rb_string);

    rb_grn_table_key_support_deconstruct(SELF(self), &table, &context,
                                         NULL, NULL, NULL,
                                         NULL, NULL, NULL,
                                         NULL);

    block_given = rb_block_given_p();
    if (!block_given)
        rb_result = rb_ary_new();

    /* grn_pat_scan() consumes the string in chunks: it fills `hits`
     * (up to its capacity) and sets `rest` past the scanned portion,
     * so we loop until the whole string has been consumed. */
    while (string_length > 0) {
        const char *rest;
        int i, n_hits;
        unsigned int previous_offset = 0;

        n_hits = grn_pat_scan(context, (grn_pat *)table,
                              string, string_length,
                              hits, sizeof(hits) / sizeof(*hits),
                              &rest);
        for (i = 0; i < n_hits; i++) {
            VALUE record, term, matched_info;

            /* Skip hits that start before the previously accepted
             * hit; presumably this drops overlapping shorter matches
             * so the longest match wins — TODO confirm against
             * grn_pat_scan's hit ordering. */
            if (hits[i].offset < previous_offset)
                continue;

            record = rb_grn_record_new(self, hits[i].id, Qnil);
            term = rb_grn_context_rb_string_new(context,
                                                string + hits[i].offset,
                                                hits[i].length);
            /* [record, word, start(bytes), length(bytes)] */
            matched_info = rb_ary_new3(4,
                                       record, term,
                                       UINT2NUM(hits[i].offset),
                                       UINT2NUM(hits[i].length));
            if (block_given) {
                rb_yield(matched_info);
            } else {
                rb_ary_push(rb_result, matched_info);
            }
            previous_offset = hits[i].offset;
        }
        /* Advance past the chunk that was actually scanned. */
        string_length -= rest - string;
        string = rest;
    }

    return rb_result;
}
/*
 * Appends a highlighted copy of `string` (of `string_length` bytes) to
 * `highlighted`: every occurrence of a key stored in the patricia trie
 * `keywords` is wrapped in an open/close tag pair, and the surrounding
 * text is copied through grn_pat_tag_keys_put_original_text() (which is
 * expected to HTML-escape it when `use_html_escape` is set).
 *
 * The tag pair for a match is chosen from the `n_tags` parallel arrays
 * `open_tags`/`close_tags` (with their byte lengths) by the matched
 * record's ID: `(id - 1) % n_tags`.
 *
 * Always returns GRN_SUCCESS.
 */
static grn_rc
grn_pat_tag_keys(grn_ctx *ctx, grn_obj *keywords,
                 const char *string, unsigned int string_length,
                 const char **open_tags,
                 unsigned int *open_tag_lengths,
                 const char **close_tags,
                 unsigned int *close_tag_lengths,
                 unsigned int n_tags,
                 grn_obj *highlighted,
                 grn_bool use_html_escape)
{
  while (string_length > 0) {
#define MAX_N_HITS 16
    grn_pat_scan_hit hits[MAX_N_HITS];
    const char *rest;
    unsigned int i, n_hits;
    unsigned int previous = 0;
    size_t chunk_length;

    /* Scan at most MAX_N_HITS matches; `rest` is set just past the
     * portion of `string` that was actually scanned, so the loop
     * resumes from there when the hit buffer fills up. */
    n_hits = grn_pat_scan(ctx, (grn_pat *)keywords,
                          string, string_length,
                          hits, MAX_N_HITS, &rest);
    for (i = 0; i < n_hits; i++) {
      unsigned int nth_tag;

      /* Emit the unmatched text between the previous hit and this one.
       * (Plain `>` comparison: both operands are unsigned, so computing
       * the difference first could wrap.) */
      if (hits[i].offset > previous) {
        grn_pat_tag_keys_put_original_text(ctx, highlighted,
                                           string + previous,
                                           hits[i].offset - previous,
                                           use_html_escape);
      }

      nth_tag = ((hits[i].id - 1) % n_tags);
      GRN_TEXT_PUT(ctx, highlighted,
                   open_tags[nth_tag], open_tag_lengths[nth_tag]);
      grn_pat_tag_keys_put_original_text(ctx, highlighted,
                                         string + hits[i].offset,
                                         hits[i].length,
                                         use_html_escape);
      GRN_TEXT_PUT(ctx, highlighted,
                   close_tags[nth_tag], close_tag_lengths[nth_tag]);
      previous = hits[i].offset + hits[i].length;
    }

    chunk_length = rest - string;
    /* Emit only the trailing unmatched text of the scanned chunk.
     * BUG FIX: this previously passed `string_length - previous`,
     * which, whenever the hit buffer filled (rest < string end),
     * also emitted the not-yet-scanned tail — that tail was then
     * scanned and emitted again on the next iteration, duplicating
     * output text. */
    if (chunk_length > previous) {
      grn_pat_tag_keys_put_original_text(ctx, highlighted,
                                         string + previous,
                                         chunk_length - previous,
                                         use_html_escape);
    }
    string_length -= chunk_length;
    string = rest;
#undef MAX_N_HITS
  }

  return GRN_SUCCESS;
}
/*
 * Tokenizer "next" callback: emits one token per call by pushing it
 * with grn_tokenizer_token_push() and advancing the tokenizer cursor
 * (`tokenizer->next`, `rest_length`, `ctypes_next`).
 *
 * Per call it decides, in order:
 *   1. whether the cursor sits on a phrase-table hit (emit the whole
 *      phrase as one token),
 *   2. otherwise whether the characters form a "grouped" run (emit the
 *      whole group), or else take an ngram of `ngram_unit` characters,
 *   3. optionally extends the ngram by 1–2 characters via the
 *      variable-gram (vgram) table,
 *   4. computes the status flags (LAST/REACH_END/UNMATURED/OVERLAP/
 *      SKIP) before pushing.
 *
 * Returns NULL (the pushed token is the result).
 */
static grn_obj *
yangram_next(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
             grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *string_end = tokenizer->end;
  const unsigned char *token_top = tokenizer->next;   /* start of this token */
  const unsigned char *token_next = token_top;        /* where the next call resumes */
  const unsigned char *token_tail = token_top;        /* end of this token */
  int token_size = 0;                                 /* token length in characters */
  grn_bool is_token_grouped = GRN_FALSE;
  const unsigned char *token_ctypes = NULL;           /* char-type array for this position */
  unsigned int ctypes_skip_size;
  int char_length = 0;
  grn_tokenizer_status status = 0;
  grn_bool is_token_hit = GRN_FALSE;                  /* cursor is on a phrase-table hit */
  grn_obj *lexicon = args[0];

  if (tokenizer->phrase_table) {
    /* Advance past hits whose offset is already behind the cursor. */
    if (tokenizer->nhits > 0 &&
        token_top - (const unsigned char *)tokenizer->scan_start >
        tokenizer->hits[tokenizer->current_hit].offset) {
      tokenizer->current_hit++;
    }
    /* Exhausted the buffered hits: scan the next chunk of the input
     * for phrase matches (grn_pat_scan fills up to MAX_N_HITS and
     * moves scan_rest past the scanned portion). */
    if (tokenizer->current_hit >= tokenizer->nhits) {
      tokenizer->scan_start = tokenizer->scan_rest;
      unsigned int scan_rest_length =
        tokenizer->end - (const unsigned char *)tokenizer->scan_rest;
      if (scan_rest_length > 0) {
        tokenizer->nhits = grn_pat_scan(ctx,
                                        (grn_pat *)tokenizer->phrase_table,
                                        tokenizer->scan_rest,
                                        scan_rest_length,
                                        tokenizer->hits, MAX_N_HITS,
                                        &(tokenizer->scan_rest));
        tokenizer->current_hit = 0;
      }
    }
    /* The current hit starts exactly at the cursor -> emit it whole. */
    if (tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        token_top - (const unsigned char *)tokenizer->scan_start ==
        tokenizer->hits[tokenizer->current_hit].offset) {
      is_token_hit = GRN_TRUE;
    }
  }

  if (tokenizer->ctypes) {
    token_ctypes = tokenizer->ctypes + tokenizer->ctypes_next;
  } else {
    token_ctypes = NULL;
  }

  if (is_token_hit) {
    /* Phrase hit: consume the full hit length as one token and resume
     * after it (no overlap with the following token). */
    token_size = forward_scan_hit_token_tail(ctx, tokenizer, &token_tail,
                                             tokenizer->hits[tokenizer->current_hit].length);
    token_next = token_tail;
    tokenizer->current_hit++;
  } else {
    is_token_grouped = is_token_group(tokenizer, token_ctypes);
    if (is_token_grouped) {
      /* Grouped run (e.g. same character class): one token, resume
       * after the whole group. */
      token_size = forward_grouped_token_tail(ctx, tokenizer,
                                              token_ctypes, &token_tail);
      token_next = token_tail;
    } else {
      /* Plain ngram: resume one character after the token start so
       * consecutive ngrams overlap. */
      token_size = forward_ngram_token_tail(ctx, tokenizer,
                                            token_ctypes, &token_tail);
      char_length = grn_plugin_charlen(ctx, (char *)token_next,
                                       tokenizer->rest_length,
                                       tokenizer->query->encoding);
      token_next += char_length;
    }
  }

  /* How far to advance into the ctypes array next call: a full token
   * for grouped/hit tokens, one character for overlapping ngrams,
   * nothing when the input is exhausted. */
  if (token_top == token_tail || token_next == string_end) {
    ctypes_skip_size = 0;
  } else {
    if (is_token_grouped || is_token_hit) {
      ctypes_skip_size = token_size;
    } else {
      ctypes_skip_size = 1;
    }
  }

  if (tokenizer->use_vgram > 0 && !is_token_grouped) {
    grn_bool maybe_vgram = GRN_FALSE;
    grn_id id;
    /* Is the current ngram a registered vgram prefix? */
    id = grn_table_get(ctx, tokenizer->vgram_table,
                       (const char *)token_top, token_tail - token_top);
    if (id) {
      maybe_vgram = GRN_TRUE;
    }
    if (tokenizer->use_vgram >= VGRAM_BOTH && !maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail,
                           token_ctypes, token_size)) {
        /* Also test the next overlapping ngram against the vgram
         * table — presumably to extend when the following ngram is a
         * vgram prefix; confirm against the plugin's vgram docs. */
        grn_id id;
        const unsigned char *token_next_tail;
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_next_tail = token_tail + char_length;
        id = grn_table_get(ctx, tokenizer->vgram_table,
                           (const char *)token_next,
                           token_next_tail - token_next);
        if (id) {
          maybe_vgram = GRN_TRUE;
        }
      } else if (token_tail == string_end &&
                 tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
        maybe_vgram = GRN_TRUE;
      }
    }
    if (maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail,
                           token_ctypes, token_size)) {
        /* Extend the token by one character (vgram). */
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_size++;
        token_tail += char_length;
        if (tokenizer->use_vgram == VGRAM_QUAD) {
          if (token_tail < string_end &&
              !is_group_border(ctx, tokenizer, token_tail,
                               token_ctypes, token_size)) {
            /* Extend once more if the extended token is itself a
             * registered vgram (quad-gram mode). */
            id = grn_table_get(ctx, tokenizer->vgram_table,
                               (const char *)token_top,
                               token_tail - token_top);
            if (id) {
              char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                               tokenizer->rest_length,
                                               tokenizer->query->encoding);
              token_size++;
              token_tail += char_length;
            }
          } else {
            /* Cannot extend at a border/end: at query time, make sure
             * the truncated token exists in the lexicon and mark it as
             * a forced prefix search. */
            if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
              grn_id tid;
              tid = grn_table_get(ctx, lexicon,
                                  (const char *)token_top,
                                  token_tail - token_top);
              if (tid == GRN_ID_NIL) {
                int added;
                grn_table_add(ctx, lexicon,
                              (const char *)token_top,
                              token_tail - token_top, &added);
              }
              status |= GRN_TOKEN_FORCE_PREFIX;
            }
          }
        }
      } else {
        /* Vgram candidate but no room to extend: same query-time
         * lexicon registration + forced-prefix handling as above. */
        if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
          grn_id tid;
          tid = grn_table_get(ctx, lexicon,
                              (const char *)token_top,
                              token_tail - token_top);
          if (tid == GRN_ID_NIL) {
            int added;
            grn_table_add(ctx, lexicon,
                          (const char *)token_top,
                          token_tail - token_top, &added);
          }
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    }
  }

  /* Status flags for the token being pushed. */
  if (token_top == token_tail || token_next == string_end) {
    status |= GRN_TOKEN_LAST;
  }
  if (token_tail == string_end) {
    status |= GRN_TOKEN_REACH_END;
  }
  if (!is_token_grouped && !is_token_hit &&
      token_size < tokenizer->ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  /* Token starts before the tail of the previously pushed token ->
   * it overlaps; at query time, overlapping tokens may be skipped
   * entirely (unless globally disabled via
   * grn_ii_overlap_token_skip_enable). */
  if (tokenizer->pushed_token_tail &&
      token_top < tokenizer->pushed_token_tail) {
    status |= GRN_TOKEN_OVERLAP;
    if (tokenizer->skip_overlap &&
        !grn_ii_overlap_token_skip_enable &&
        !(status & GRN_TOKEN_REACH_END) &&
        !(status & GRN_TOKEN_SKIP_WITH_POSITION) &&
        tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
      if (token_tail <= tokenizer->pushed_token_tail) {
        /* Fully contained in the previous token. */
        status |= GRN_TOKEN_SKIP;
      } else {
        if (!is_group_border(ctx, tokenizer, token_tail,
                             token_ctypes, token_size)) {
          status |= GRN_TOKEN_SKIP;
        }
      }
    }
  }
  /* Remember the tail only for tokens that are actually emitted. */
  if (!(status & GRN_TOKEN_SKIP) &&
      !(status & GRN_TOKEN_SKIP_WITH_POSITION)) {
    tokenizer->pushed_token_tail = token_tail;
  }

  /* Advance the cursor state for the next call. */
  tokenizer->next = token_next;
  tokenizer->rest_length = string_end - token_next;
  tokenizer->ctypes_next = tokenizer->ctypes_next + ctypes_skip_size;

  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           (const char *)token_top,
                           token_tail - token_top,
                           status);
  return NULL;
}