Example #1
/*
 * call-seq:
 *   patricia_trie.scan(string) -> Array
 *   patricia_trie.scan(string) {|record, word, start, length| ... }
 *
 * Scans _string_ and passes information about each substring that
 * matches a key stored in _patricia_trie_ to the block. When
 * multiple keys match, the longest match is preferred.
 *
 * [_record_]
 *   The Groonga::Record of the matched key.
 *
 * [_word_]
 *   The matched substring.
 *
 * [_start_]
 *   The byte offset at which _word_ appears in _string_.
 *
 * [_length_]
 *   The length of _word_ in bytes.
 *
 * If no block is given, the match information is collected and
 * returned as an array.
 *
 *   words = Groonga::PatriciaTrie.create(:key_type => "ShortText",
 *                                        :key_normalize => true)
 *   words.add("リンク")
 *   adventure_of_link = words.add('リンクの冒険')
 *   words.add('冒険')
 *   gaxtu = words.add('ガッ')
 *   muteki = words.add('MUTEKI')
 *
 *   text = 'muTEki リンクの冒険 ミリバール ガッ'
 *   words.scan(text).each do |record, word, start, length|
 *     p [record.key, word, start, length]
 *       # -> ["MUTEKI", "muTEki", 0, 6]
 *       # -> ["リンクの冒険", "リンクの冒険", 7, 18]
 *       # -> ["ガッ", "ガッ", 42, 6]
 *   end
 *
 *   words.scan(text)
 *     # -> [[muteki, "muTEki", 0, 6],
 *     #     [adventure_of_link, "リンクの冒険", 7, 18],
 *     #     [gaxtu, "ガッ", 42, 6]]
 */
static VALUE
rb_grn_patricia_trie_scan (VALUE self, VALUE rb_string)
{
    grn_ctx *context;
    grn_obj *table;
    VALUE rb_result = Qnil;
    grn_pat_scan_hit hits[1024];
    const char *string;
    long string_length;
    grn_bool block_given;

    string = StringValuePtr(rb_string);
    string_length = RSTRING_LEN(rb_string);

    rb_grn_table_key_support_deconstruct(SELF(self), &table, &context,
					 NULL, NULL, NULL,
					 NULL, NULL, NULL,
					 NULL);

    block_given = rb_block_given_p();
    if (!block_given)
	rb_result = rb_ary_new();

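    /* Scan the string chunk by chunk: each grn_pat_scan() call records
     * at most 1024 hits and returns the unscanned remainder via rest. */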
    while (string_length > 0) {
	const char *rest;
	int i, n_hits;
	unsigned int previous_offset = 0;

	n_hits = grn_pat_scan(context, (grn_pat *)table,
			      string, string_length,
			      hits, sizeof(hits) / sizeof(*hits),
			      &rest);
	for (i = 0; i < n_hits; i++) {
	    VALUE record, term, matched_info;

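	    /* Skip hits that start before the previously accepted hit. */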
	    if (hits[i].offset < previous_offset)
		continue;

	    record = rb_grn_record_new(self, hits[i].id, Qnil);
	    term = rb_grn_context_rb_string_new(context,
						string + hits[i].offset,
						hits[i].length);
	    matched_info = rb_ary_new3(4,
				       record,
				       term,
				       UINT2NUM(hits[i].offset),
				       UINT2NUM(hits[i].length));
	    if (block_given) {
		rb_yield(matched_info);
	    } else {
		rb_ary_push(rb_result, matched_info);
	    }
	    previous_offset = hits[i].offset;
	}
	string_length -= rest - string;
	string = rest;
    }

    return rb_result;
}
Example #2
static grn_rc
grn_pat_tag_keys(grn_ctx *ctx, grn_obj *keywords,
                 const char *string, unsigned int string_length,
                 const char **open_tags, unsigned int *open_tag_lengths,
                 const char **close_tags, unsigned int *close_tag_lengths,
                 unsigned int n_tags,
                 grn_obj *highlighted,
                 grn_bool use_html_escape)
{
  while (string_length > 0) {
#define MAX_N_HITS 16
    grn_pat_scan_hit hits[MAX_N_HITS];
    const char *rest;
    unsigned int i, n_hits;
    unsigned int previous = 0;
    size_t chunk_length;

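    /* Collect keyword hits in the current chunk; rest points just past
     * the scanned bytes. */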
    n_hits = grn_pat_scan(ctx, (grn_pat *)keywords,
                          string, string_length,
                          hits, MAX_N_HITS, &rest);
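    /* For each hit: flush the plain text preceding it, then emit the
     * matched keyword wrapped in its open/close tag pair. */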
    for (i = 0; i < n_hits; i++) {
      unsigned int nth_tag;
      if (hits[i].offset - previous > 0) {
        grn_pat_tag_keys_put_original_text(ctx,
                                           highlighted,
                                           string + previous,
                                           hits[i].offset - previous,
                                           use_html_escape);
      }
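      /* Cycle through the available tag pairs by keyword record ID. */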
      nth_tag = ((hits[i].id - 1) % n_tags);
      GRN_TEXT_PUT(ctx, highlighted,
                   open_tags[nth_tag], open_tag_lengths[nth_tag]);
      grn_pat_tag_keys_put_original_text(ctx,
                                         highlighted,
                                         string + hits[i].offset,
                                         hits[i].length,
                                         use_html_escape);
      GRN_TEXT_PUT(ctx, highlighted,
                   close_tags[nth_tag], close_tag_lengths[nth_tag]);
      previous = hits[i].offset + hits[i].length;
    }

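    /* Flush any untagged text left in this chunk after the last hit. */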
    chunk_length = rest - string;
    if (chunk_length > previous) {
      grn_pat_tag_keys_put_original_text(ctx,
                                         highlighted,
                                         string + previous,
                                         chunk_length - previous,
                                         use_html_escape);
    }
    string_length -= chunk_length;
    string = rest;
#undef MAX_N_HITS
  }

  return GRN_SUCCESS;
}
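
A minimal sketch of how this helper might be driven, assuming a patricia
trie already populated with the keywords to highlight. Because
grn_pat_tag_keys() is static, such a caller would live in the same
translation unit; the highlight_example name and the <mark> tags are
illustrative, not taken from the source.

static void
highlight_example(grn_ctx *ctx, grn_obj *keywords,
                  const char *text, unsigned int text_length,
                  grn_obj *highlighted)
{
  /* A single open/close tag pair; grn_pat_tag_keys() cycles through
   * the n_tags pairs by keyword record ID. */
  const char *open_tags[] = {"<mark>"};
  unsigned int open_tag_lengths[] = {6};
  const char *close_tags[] = {"</mark>"};
  unsigned int close_tag_lengths[] = {7};

  grn_pat_tag_keys(ctx, keywords,
                   text, text_length,
                   open_tags, open_tag_lengths,
                   close_tags, close_tag_lengths,
                   1,
                   highlighted,
                   GRN_TRUE /* HTML-escape the original text */);
}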
Example #3
static grn_obj *
yangram_next(grn_ctx *ctx, GNUC_UNUSED int nargs, grn_obj **args,
             grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *string_end = tokenizer->end;
  const unsigned char *token_top = tokenizer->next;
  const unsigned char *token_next = token_top;
  const unsigned char *token_tail = token_top;
  int token_size = 0;
  grn_bool is_token_grouped = GRN_FALSE;
  const unsigned char *token_ctypes = NULL;
  unsigned int ctypes_skip_size;
  int char_length = 0;
  grn_tokenizer_status status = 0;
  grn_bool is_token_hit = GRN_FALSE;
  grn_obj *lexicon = args[0];

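  /* Phrase mode: scan ahead with grn_pat_scan() and remember the hits;
   * a token starting exactly at a hit offset is emitted as one phrase. */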
  if (tokenizer->phrase_table) {
    if (tokenizer->nhits > 0 &&
        token_top - (const unsigned char *)tokenizer->scan_start >
        tokenizer->hits[tokenizer->current_hit].offset) {
      tokenizer->current_hit++;
    }
    if (tokenizer->current_hit >= tokenizer->nhits) {
      unsigned int scan_rest_length;
      tokenizer->scan_start = tokenizer->scan_rest;
      scan_rest_length =
        tokenizer->end - (const unsigned char *)tokenizer->scan_rest;
      if (scan_rest_length > 0) {
        tokenizer->nhits = grn_pat_scan(ctx, (grn_pat *)tokenizer->phrase_table,
                                        tokenizer->scan_rest,
                                        scan_rest_length,
                                        tokenizer->hits, MAX_N_HITS, &(tokenizer->scan_rest));
        tokenizer->current_hit = 0;
      }
    }
    if (tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        token_top - (const unsigned char *)tokenizer->scan_start ==
        tokenizer->hits[tokenizer->current_hit].offset) {
      is_token_hit = GRN_TRUE;
    }
  }

  if (tokenizer->ctypes) {
    token_ctypes = tokenizer->ctypes + tokenizer->ctypes_next;
  } else {
    token_ctypes = NULL;
  }

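  /* A phrase hit is emitted whole; otherwise extract either a token
   * grouped by character type or a plain N-gram. */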
  if (is_token_hit) {
    token_size = forward_scan_hit_token_tail(ctx, tokenizer, &token_tail,
                                             tokenizer->hits[tokenizer->current_hit].length);
    token_next = token_tail;
    tokenizer->current_hit++;
  } else {
    is_token_grouped = is_token_group(tokenizer, token_ctypes);
    if (is_token_grouped) {
      token_size = forward_grouped_token_tail(ctx, tokenizer, token_ctypes, &token_tail);
      token_next = token_tail;
    } else {
      token_size = forward_ngram_token_tail(ctx, tokenizer, token_ctypes, &token_tail);
      char_length = grn_plugin_charlen(ctx, (char *)token_next,
                                       tokenizer->rest_length,
                                       tokenizer->query->encoding);
      token_next += char_length;
    }
  }

  if (token_top == token_tail || token_next == string_end) {
    ctypes_skip_size = 0;
  } else {
    if (is_token_grouped || is_token_hit) {
      ctypes_skip_size = token_size;
    } else {
      ctypes_skip_size = 1;
    }
  }

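  /* Variable-size N-gram: an N-gram registered in vgram_table is
   * extended by one character (and by one more under VGRAM_QUAD). */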
  if (tokenizer->use_vgram > 0 && !is_token_grouped) {
    grn_bool maybe_vgram = GRN_FALSE;

    grn_id id;
    id = grn_table_get(ctx, tokenizer->vgram_table,
                       (const char *)token_top, token_tail - token_top);
    if (id) {
      maybe_vgram = GRN_TRUE;
    }

    if (tokenizer->use_vgram >= VGRAM_BOTH && !maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
        grn_id id;
        const unsigned char *token_next_tail;
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_next_tail = token_tail + char_length;
        id = grn_table_get(ctx, tokenizer->vgram_table,
                           (const char *)token_next, token_next_tail - token_next);
        if (id) {
          maybe_vgram = GRN_TRUE;
        }
      } else if (token_tail == string_end &&
                 tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
        maybe_vgram = GRN_TRUE;
      }
    }

    if (maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_size++;
        token_tail += char_length;

        if (tokenizer->use_vgram == VGRAM_QUAD) {
          if (token_tail < string_end &&
              !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
            id = grn_table_get(ctx, tokenizer->vgram_table,
                               (const char *)token_top, token_tail - token_top);
            if (id) {
              char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                               tokenizer->rest_length,
                                               tokenizer->query->encoding);
              token_size++;
              token_tail += char_length;
            }
          } else {
            if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
              grn_id tid;
              tid = grn_table_get(ctx, lexicon,
                                  (const char *)token_top, token_tail - token_top);
              if (tid == GRN_ID_NIL) {
                int added;
                grn_table_add(ctx, lexicon,
                              (const char *)token_top, token_tail - token_top, &added);
              }
              status |= GRN_TOKEN_FORCE_PREFIX;
            }
          }
        }
      } else {
        if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
          grn_id tid;
          tid = grn_table_get(ctx, lexicon,
                              (const char *)token_top, token_tail - token_top);
          if (tid == GRN_ID_NIL) {
            int added;
            grn_table_add(ctx, lexicon,
                          (const char *)token_top, token_tail - token_top, &added);
          }
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    }
  }

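  /* Decide the status flags reported alongside this token. */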
  if (token_top == token_tail || token_next == string_end) {
    status |= GRN_TOKEN_LAST;
  }

  if (token_tail == string_end) {
    status |= GRN_TOKEN_REACH_END;
  }

  if (!is_token_grouped && !is_token_hit && token_size < tokenizer->ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }

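  /* Tokens overlapping the previously pushed one may be skipped in
   * GRN_TOKENIZE_GET mode to reduce the number of index lookups. */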
  if (tokenizer->pushed_token_tail &&
      token_top < tokenizer->pushed_token_tail) {
    status |= GRN_TOKEN_OVERLAP;
    if (tokenizer->skip_overlap &&
        !grn_ii_overlap_token_skip_enable &&
        !(status & GRN_TOKEN_REACH_END) &&
        !(status & GRN_TOKEN_SKIP_WITH_POSITION) &&
        tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
      if (token_tail <= tokenizer->pushed_token_tail) {
        status |= GRN_TOKEN_SKIP;
      } else {
        if (!is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
          status |= GRN_TOKEN_SKIP;
        }
      }
    }
  }

  if (!(status & GRN_TOKEN_SKIP) &&
      !(status & GRN_TOKEN_SKIP_WITH_POSITION)) {
    tokenizer->pushed_token_tail = token_tail;
  }

  tokenizer->next = token_next;
  tokenizer->rest_length = string_end - token_next;
  tokenizer->ctypes_next = tokenizer->ctypes_next + ctypes_skip_size;

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)token_top,
                           token_tail - token_top,
                           status);

  return NULL;
}