Beispiel #1
0
static grn_rc
sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *query,
                        uint32_t max_distance, uint32_t prefix_match_size,
                        uint32_t max_expansion, int flags, grn_obj *res, grn_operator op)
{
  grn_table_cursor *tc;
  char *sx = GRN_TEXT_VALUE(query);
  char *ex = GRN_BULK_CURR(query);

  if (op == GRN_OP_AND) {
    tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID);
  } else {
    tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID);
  }
  if (tc) {
    grn_id id;
    grn_obj value;
    score_heap *heap;
    int i, n;
    GRN_TEXT_INIT(&value, 0);

    heap = score_heap_open(ctx, SCORE_HEAP_SIZE);
    if (!heap) {
      grn_table_cursor_close(ctx, tc);
      grn_obj_unlink(ctx, &value);
      return GRN_NO_MEMORY_AVAILABLE;
    }

    while ((id = grn_table_cursor_next(ctx, tc))) {
      unsigned int distance = 0;
      grn_obj *domain;
      GRN_BULK_REWIND(&value);
      grn_obj_get_value(ctx, column, id, &value);
      domain = grn_ctx_at(ctx, ((&value))->header.domain);
      if ((&(value))->header.type == GRN_VECTOR) {
        n = grn_vector_size(ctx, &value);
        for (i = 0; i < n; i++) {
          unsigned int length;
          const char *vector_value = NULL;
          length = grn_vector_get_element(ctx, &value, i, &vector_value, NULL, NULL);

          if (!prefix_match_size ||
              (prefix_match_size > 0 && length >= prefix_match_size &&
               !memcmp(sx, vector_value, prefix_match_size))) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          (char *)vector_value,
                                          (char *)vector_value + length, flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
              break;
            }
          }
        }
      } else if ((&(value))->header.type == GRN_UVECTOR &&
                  grn_obj_is_table(ctx, domain)) {
        n = grn_vector_size(ctx, &value);
        for (i = 0; i < n; i++) {
          grn_id rid;
          char key_name[GRN_TABLE_MAX_KEY_SIZE];
          int key_length;
          rid = grn_uvector_get_element(ctx, &value, i, NULL);
          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);

          if (!prefix_match_size ||
              (prefix_match_size > 0 && key_length >= prefix_match_size &&
               !memcmp(sx, key_name, prefix_match_size))) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          key_name, key_name + key_length, flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
              break;
            }
          }
        }
      } else {
        if (grn_obj_is_reference_column(ctx, column)) {
          grn_id rid;
          char key_name[GRN_TABLE_MAX_KEY_SIZE];
          int key_length;
          rid = GRN_RECORD_VALUE(&value);
          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
          if (!prefix_match_size ||
              (prefix_match_size > 0 && key_length >= prefix_match_size &&
               !memcmp(sx, key_name, prefix_match_size))) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          key_name, key_name + key_length, flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
            }
          }
        } else {
          if (!prefix_match_size ||
              (prefix_match_size > 0 && GRN_TEXT_LEN(&value) >= prefix_match_size &&
               !memcmp(sx, GRN_TEXT_VALUE(&value), prefix_match_size))) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          GRN_TEXT_VALUE(&value),
                                          GRN_BULK_CURR(&value), flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
            }
          }
        }
      }
      grn_obj_unlink(ctx, domain);
    }
    grn_table_cursor_close(ctx, tc);
    grn_obj_unlink(ctx, &value);

    for (i = 0; i < heap->n_entries; i++) {
      if (max_expansion > 0 && i >= max_expansion) {
        break;
      }
      {
        grn_posting posting;
        posting.rid = heap->nodes[i].id;
        posting.sid = 1;
        posting.pos = 0;
        posting.weight = max_distance - heap->nodes[i].score;
        grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op);
      }
    }
    grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op);
    score_heap_close(ctx, heap);
  }

  return GRN_SUCCESS;
}
static mrb_value
mrb_grn_index_cursor_select(mrb_state *mrb, mrb_value self)
{
  grn_ctx *ctx = (grn_ctx *)mrb->ud;
  mrb_value mrb_result_set;
  mrb_value mrb_options;
  grn_obj *index_cursor;
  grn_obj *expr = NULL;
  grn_obj *expr_variable = NULL;
  int offset = 0;
  int limit = 10;
  int n_matched_records = 0;
  mrb_value mrb_index;
  grn_obj *index;
  grn_obj *lexicon;
  grn_obj *data_table;
  grn_hash *result_set;
  grn_posting *posting;
  grn_id term_id;
  grn_operator op = GRN_OP_OR;

  mrb_get_args(mrb, "o|H", &mrb_result_set, &mrb_options);

  index_cursor = DATA_PTR(self);
  result_set = DATA_PTR(mrb_result_set);

  if (!mrb_nil_p(mrb_options)) {
    mrb_value mrb_expr;
    mrb_value mrb_offset;
    mrb_value mrb_limit;

    mrb_expr = grn_mrb_options_get_lit(mrb, mrb_options, "expression");
    if (!mrb_nil_p(mrb_expr)) {
      expr = DATA_PTR(mrb_expr);
      expr_variable = grn_expr_get_var_by_offset(ctx, expr, 0);
    }

    mrb_offset = grn_mrb_options_get_lit(mrb, mrb_options, "offset");
    if (!mrb_nil_p(mrb_offset)) {
      offset = mrb_fixnum(mrb_offset);
    }

    mrb_limit = grn_mrb_options_get_lit(mrb, mrb_options, "limit");
    if (!mrb_nil_p(mrb_limit)) {
      limit = mrb_fixnum(mrb_limit);
    }
  }

  if (limit <= 0) {
    return mrb_fixnum_value(n_matched_records);
  }

  mrb_index = mrb_iv_get(mrb, self, mrb_intern_lit(mrb, "@index"));
  index = DATA_PTR(mrb_index);
  lexicon = ((grn_ii *)index)->lexicon;
  data_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index));

  while ((posting = grn_index_cursor_next(ctx, index_cursor, &term_id))) {
    if (expr) {
      grn_bool matched_raw;
      grn_obj *matched;

      GRN_RECORD_SET(ctx, expr_variable, posting->rid);
      matched = grn_expr_exec(ctx, expr, 0);
      if (!matched) {
        grn_mrb_ctx_check(mrb);
        continue;
      }
      GRN_TRUEP(ctx, matched, matched_raw);
      if (!matched_raw) {
        continue;
      }
    }
    n_matched_records++;
    if (offset > 0) {
      offset--;
      continue;
    }
    grn_ii_posting_add(ctx, (grn_ii_posting *)posting, result_set, op);
    limit--;
    if (limit == 0) {
      break;
    }
  }
  grn_ii_resolve_sel_and(ctx, result_set, op);

  return mrb_fixnum_value(n_matched_records);
}