Example #1
0
/*
 * Tells whether a candidate string passes the exact-prefix filter.
 *
 * A prefix_match_size of 0 disables the filter. Otherwise both the query and
 * the candidate must hold at least prefix_match_size bytes and those leading
 * bytes must be identical. Checking the query length as well keeps memcmp()
 * from reading past the end of a query that is shorter than the prefix.
 *
 * Returns non-zero when the candidate should be scored, 0 when it is skipped.
 */
static int
sequential_fuzzy_search_prefix_match(const char *query, size_t query_length,
                                     const char *candidate, size_t candidate_length,
                                     uint32_t prefix_match_size)
{
  if (prefix_match_size == 0) {
    return 1;
  }
  if (query_length < prefix_match_size || candidate_length < prefix_match_size) {
    return 0;
  }
  return memcmp(query, candidate, prefix_match_size) == 0;
}

/*
 * Fuzzy-matches `query` against the values of `column` by scanning records
 * one by one and merges the best matches into `res`.
 *
 * The scan walks `res` when `op` is GRN_OP_AND and `table` otherwise. For
 * each record the column value is read and compared with the query:
 *   - GRN_VECTOR: each element is compared as a string;
 *   - GRN_UVECTOR whose domain is a table: the key of each referenced
 *     record is compared;
 *   - reference column: the key of the referenced record is compared;
 *   - anything else: the value is compared as text.
 * For vectors the record is pushed at most once (on the first element within
 * `max_distance`). A candidate is only scored when it passes the
 * `prefix_match_size` filter (0 disables the filter).
 *
 * After the scan, heap entries are added to `res` with weight
 * `max_distance - distance`; when `max_expansion` is greater than 0 at most
 * that many entries are added.
 *
 * Returns GRN_NO_MEMORY_AVAILABLE when the score heap cannot be opened and
 * GRN_SUCCESS otherwise (including when the cursor could not be opened, in
 * which case `res` is left untouched).
 */
static grn_rc
sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *query,
                        uint32_t max_distance, uint32_t prefix_match_size,
                        uint32_t max_expansion, int flags, grn_obj *res, grn_operator op)
{
  grn_table_cursor *tc;
  char *sx = GRN_TEXT_VALUE(query);
  char *ex = GRN_BULK_CURR(query);
  size_t query_length = (size_t)(ex - sx);

  /*
   * NOTE(review): in the GRN_OP_AND case the ids produced by a cursor over
   * `res` are used below as record ids of `column`. If `res` is keyed by
   * record id (as the grn_ii_posting_add() call suggests), the record id
   * would have to be read from the cursor key instead -- TODO confirm.
   */
  if (op == GRN_OP_AND) {
    tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID);
  } else {
    tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID);
  }
  if (tc) {
    grn_id id;
    grn_obj value;
    score_heap *heap;
    int i, n;
    GRN_TEXT_INIT(&value, 0);

    heap = score_heap_open(ctx, SCORE_HEAP_SIZE);
    if (!heap) {
      grn_table_cursor_close(ctx, tc);
      grn_obj_unlink(ctx, &value);
      return GRN_NO_MEMORY_AVAILABLE;
    }

    while ((id = grn_table_cursor_next(ctx, tc))) {
      unsigned int distance = 0;
      grn_obj *domain;
      GRN_BULK_REWIND(&value);
      grn_obj_get_value(ctx, column, id, &value);
      domain = grn_ctx_at(ctx, ((&value))->header.domain);
      if ((&(value))->header.type == GRN_VECTOR) {
        n = grn_vector_size(ctx, &value);
        for (i = 0; i < n; i++) {
          unsigned int length;
          const char *vector_value = NULL;
          length = grn_vector_get_element(ctx, &value, i, &vector_value, NULL, NULL);

          if (sequential_fuzzy_search_prefix_match(sx, query_length,
                                                   vector_value, length,
                                                   prefix_match_size)) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          (char *)vector_value,
                                          (char *)vector_value + length, flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
              /* One matching element is enough for this record. */
              break;
            }
          }
        }
      } else if ((&(value))->header.type == GRN_UVECTOR &&
                  grn_obj_is_table(ctx, domain)) {
        n = grn_vector_size(ctx, &value);
        for (i = 0; i < n; i++) {
          grn_id rid;
          char key_name[GRN_TABLE_MAX_KEY_SIZE];
          int key_length;
          rid = grn_uvector_get_element(ctx, &value, i, NULL);
          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);

          if (sequential_fuzzy_search_prefix_match(sx, query_length,
                                                   key_name, (size_t)key_length,
                                                   prefix_match_size)) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          key_name, key_name + key_length, flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
              /* One matching element is enough for this record. */
              break;
            }
          }
        }
      } else {
        if (grn_obj_is_reference_column(ctx, column)) {
          grn_id rid;
          char key_name[GRN_TABLE_MAX_KEY_SIZE];
          int key_length;
          rid = GRN_RECORD_VALUE(&value);
          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
          if (sequential_fuzzy_search_prefix_match(sx, query_length,
                                                   key_name, (size_t)key_length,
                                                   prefix_match_size)) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          key_name, key_name + key_length, flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
            }
          }
        } else {
          if (sequential_fuzzy_search_prefix_match(sx, query_length,
                                                   GRN_TEXT_VALUE(&value),
                                                   (size_t)GRN_TEXT_LEN(&value),
                                                   prefix_match_size)) {
            distance = calc_edit_distance(ctx, sx, ex,
                                          GRN_TEXT_VALUE(&value),
                                          GRN_BULK_CURR(&value), flags);
            if (distance <= max_distance) {
              score_heap_push(ctx, heap, id, distance);
            }
          }
        }
      }
      grn_obj_unlink(ctx, domain);
    }
    grn_table_cursor_close(ctx, tc);
    grn_obj_unlink(ctx, &value);

    for (i = 0; i < heap->n_entries; i++) {
      /* max_expansion == 0 means "no limit". */
      if (max_expansion > 0 && (uint32_t)i >= max_expansion) {
        break;
      }
      {
        grn_posting posting;
        posting.rid = heap->nodes[i].id;
        posting.sid = 1;
        posting.pos = 0;
        /* Closer matches (smaller distance) get a larger weight. */
        posting.weight = max_distance - heap->nodes[i].score;
        grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op);
      }
    }
    grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op);
    score_heap_close(ctx, heap);
  }

  return GRN_SUCCESS;
}
/*
 * Gets the value of the variable size column for the record whose
 * ID is _id_.
 *
 * @example Gets weight vector value
 *    Groonga::Schema.define do |schema|
 *      schema.create_table("Products",
 *                          :type => :patricia_trie,
 *                          :key_type => "ShortText") do |table|
 *        # This is weight vector.
 *        # ":with_weight => true" is important to store weight value.
 *        table.short_text("tags",
 *                         :type => :vector,
 *                         :with_weight => true)
 *      end
 *    end
 *
 *    products = Groonga["Products"]
 *    rroonga = products.add("Rroonga")
 *    rroonga.tags = [
 *      {
 *        :value  => "ruby",
 *        :weight => 100,
 *      },
 *      {
 *        :value  => "groonga",
 *        :weight => 10,
 *      },
 *    ]
 *
 *    p rroonga.tags
 *    # => [
 *    #      {:value => "ruby",    :weight => 100},
 *    #      {:value => "groonga", :weight => 10}
 *    #    ]
 *
 * @overload [](id)
 *   @param [Integer, Record] id The record ID.
 *   @return [Array<Hash<Symbol, Object>>] An array of values if the column
 *     is a weight vector column.
 *     Each value is a Hash like the following form:
 *
 *     <pre>
 *     {
 *       :value  => [KEY],
 *       :weight => [WEIGHT],
 *     }
 *     </pre>
 *
 *     @[KEY]@ is the key of the table that is specified as range on
 *     creating the weight vector.
 *
 *     @[WEIGHT]@ is a positive integer.
 *
 *   @return [::Object] See {Groonga::Object#[]} for columns except
 *     weight vector column.
 *
 * @since 4.0.1.
 */
static VALUE
rb_grn_variable_size_column_array_reference (VALUE self, VALUE rb_id)
{
    grn_ctx *context = NULL;
    grn_obj *column;
    grn_obj *range;
    grn_obj *value;
    grn_id record_id;
    VALUE rb_range;
    VALUE rb_elements;
    unsigned int n_elements;
    unsigned int index;

    rb_grn_variable_size_column_deconstruct(SELF(self), &column, &context,
                                            NULL, NULL, &value, NULL,
                                            NULL, &range);

    /* Columns without weights are delegated to the superclass method. */
    if ((column->header.flags & GRN_OBJ_WITH_WEIGHT) == 0) {
        return rb_call_super(1, &rb_id);
    }

    record_id = RVAL2GRNID(rb_id, context, range, self);

    /* Reset the value buffer as a vector of the same domain before
     * loading the column value into it. */
    grn_obj_reinit(context, value,
                   value->header.domain,
                   value->header.flags | GRN_OBJ_VECTOR);
    grn_obj_get_value(context, column, record_id, value);
    rb_grn_context_check(context, self);

    rb_range = GRNTABLE2RVAL(context, range, GRN_FALSE);

    n_elements = grn_vector_size(context, value);
    rb_elements = rb_ary_new2(n_elements);

    /* Convert every element into a {:value => ..., :weight => ...} Hash. */
    index = 0;
    while (index < n_elements) {
        VALUE rb_entry;
        VALUE rb_entry_value;
        unsigned int weight = 0;
        grn_id element_domain;

        if (value->header.type != GRN_UVECTOR) {
            /* Elements are raw byte strings. */
            const char *raw;
            unsigned int raw_length;

            raw_length = grn_vector_get_element(context, value, index,
                                                &raw, &weight,
                                                &element_domain);
            rb_entry_value = rb_str_new(raw, raw_length);
        } else {
            /* Elements are record IDs in the range table. */
            grn_id element_id;

            element_id = grn_uvector_get_element(context, value, index,
                                                 &weight);
            rb_entry_value = rb_grn_record_new(rb_range, element_id, Qnil);
        }

        rb_entry = rb_hash_new();
        rb_hash_aset(rb_entry, ID2SYM(rb_intern("value")), rb_entry_value);
        rb_hash_aset(rb_entry, ID2SYM(rb_intern("weight")), UINT2NUM(weight));
        rb_ary_push(rb_elements, rb_entry);

        index++;
    }

    return rb_elements;
}
Example #3
0
/*
 * Rewrites the tag ids held by the new value with their synonyms.
 *
 * Four objects are popped from the context stack, in this order: flags,
 * the new value, the old value and an id (the id is popped but not used).
 * Nothing is done when the new value is empty or the flags value is 0.
 *
 * The domain of the old value must resolve to a table; its
 * SYNONYM_COLUMN_NAME column maps a tag id to a replacement tag id, where a
 * stored 0 means "no synonym". Depending on the new value:
 *   - string domain: the text is tokenized against the table and the new
 *     value is replaced by a record vector of the (substituted) tag ids;
 *   - GRN_UVECTOR: every element is substituted and the new value is
 *     rebuilt as a record vector;
 *   - otherwise: the new value is treated as a single record id and is
 *     overwritten only when a synonym exists.
 *
 * Always returns NULL; problems are reported through GRN_PLUGIN_LOG.
 */
static grn_obj *
command_tag_synonym(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
                    GNUC_UNUSED grn_user_data *user_data)
{
  /* The pop order is significant: it mirrors the order the values were pushed. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *newvalue = grn_ctx_pop(ctx);
  grn_obj *oldvalue = grn_ctx_pop(ctx);
  GNUC_UNUSED grn_obj *id = grn_ctx_pop(ctx);
  grn_obj record;
  grn_obj *domain;
  grn_obj *table;
  grn_obj *column;
  int domain_is_string;
  int record_is_owned = 0;
  int i,n;

  if (GRN_BULK_VSIZE(newvalue) == 0 || GRN_INT32_VALUE(flags) == 0) {
    return NULL;
  }

  /* An unresolvable domain cannot be a reference type either, so reject it
   * here instead of handing a NULL table to grn_obj_column(). */
  table = grn_ctx_at(ctx, oldvalue->header.domain);
  if (!table || !is_table(table)) {
    GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
                   "[tag-synonym] "
                   "hooked column must be reference type");
    return NULL;
  }

  column = grn_obj_column(ctx,
                          table,
                          SYNONYM_COLUMN_NAME,
                          SYNONYM_COLUMN_NAME_LEN);
  if (!column) {
    GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
                   "[tag-synonym] "
                   "couldn't open synonym column");
    return NULL;
  }

  domain = grn_ctx_at(ctx, newvalue->header.domain);
  /* Evaluate once so that is_string() is never called with a NULL domain. */
  domain_is_string = domain && is_string(domain);
  if (domain_is_string) {
    GRN_RECORD_INIT(&record, GRN_OBJ_VECTOR, oldvalue->header.domain);
    grn_table_tokenize(ctx, table, GRN_TEXT_VALUE(newvalue), GRN_TEXT_LEN(newvalue), &record, GRN_TRUE);
    /* This vector was created here, so it must be released here too. */
    record_is_owned = 1;
  } else if (newvalue->header.type == GRN_UVECTOR) {
    /* Shallow copy: `record` takes over the element buffer of `newvalue`,
     * which is re-initialized below.
     * NOTE(review): that buffer is not released afterwards; whether it can
     * be freed safely depends on how the caller allocated `newvalue` --
     * TODO confirm. */
    record = *newvalue;
  }

  if (domain_is_string || newvalue->header.type == GRN_UVECTOR) {
    grn_obj value;

    GRN_RECORD_INIT(newvalue, GRN_OBJ_VECTOR, oldvalue->header.domain);
    GRN_UINT32_INIT(&value, 0);
    n = grn_vector_size(ctx, &record);
    for (i = 0; i < n; i++) {
      grn_id tid;
      tid = grn_uvector_get_element(ctx, &record, i, NULL);
      GRN_BULK_REWIND(&value);
      grn_obj_get_value(ctx, column, tid, &value);
      /* A non-zero synonym id replaces the original tag id. */
      if (GRN_UINT32_VALUE(&value)) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_INFO,
                       "[tag-synonym] "
                       "changed: tid %d -> %d", tid, GRN_UINT32_VALUE(&value));
        tid = GRN_UINT32_VALUE(&value);
      }
      grn_uvector_add_element(ctx, newvalue, tid, 0);
    }
    grn_obj_unlink(ctx, &value);
    if (record_is_owned) {
      grn_obj_unlink(ctx, &record);
    }
  } else {
    grn_id tid;
    grn_obj value;
    tid = GRN_RECORD_VALUE(newvalue);
    GRN_UINT32_INIT(&value, 0);
    grn_obj_get_value(ctx, column, tid, &value);
    /* The new value is only touched when a synonym exists. */
    if (GRN_UINT32_VALUE(&value)) {
      GRN_PLUGIN_LOG(ctx, GRN_LOG_INFO,
                     "[tag-synonym] "
                     "changed: tid %d -> %d", tid, GRN_UINT32_VALUE(&value));
      tid = GRN_UINT32_VALUE(&value);
      GRN_BULK_REWIND(newvalue);
      GRN_RECORD_SET(ctx, newvalue, tid);
    }
    grn_obj_unlink(ctx, &value);
  }

  return NULL;
}
Example #4
0
/*
 * Tests whether the key of any record referenced by `uvector` matches the
 * regular expression given as text in `pattern`.
 *
 * The uvector's domain is resolved to an object and each element is looked
 * up in it with grn_table_get_key(); elements whose key is empty are
 * skipped. When the domain reports a normalizer the key is matched as is;
 * otherwise the key is first normalized with the normalizer registered
 * under GRN_NORMALIZER_AUTO_NAME. The scan stops at the first match.
 *
 * Returns GRN_TRUE when some key matches. Returns GRN_FALSE when nothing
 * matches, the uvector is empty, the pattern cannot be compiled, the domain
 * cannot be resolved, or regular expression support is compiled out.
 */
static grn_bool
exec_regexp_uvector_bulk(grn_ctx *ctx, grn_obj *uvector, grn_obj *pattern)
{
#ifdef GRN_SUPPORT_REGEXP
  grn_bool matched = GRN_FALSE;
  unsigned int i, size;
  OnigRegex regex;
  grn_obj *domain;
  /* Must start as NULL: nothing visible here guarantees that
   * grn_table_get_info() stores to it on every path (e.g. on failure), and
   * it is read unconditionally afterwards. */
  grn_obj *normalizer = NULL;
  grn_obj *normalizer_auto = NULL;

  size = grn_uvector_size(ctx, uvector);
  if (size == 0) {
    return GRN_FALSE;
  }

  regex = grn_onigmo_new(ctx,
                         GRN_TEXT_VALUE(pattern),
                         GRN_TEXT_LEN(pattern),
                         GRN_ONIGMO_OPTION_DEFAULT,
                         GRN_ONIGMO_SYNTAX_DEFAULT,
                         "[operator]");
  if (!regex) {
    return GRN_FALSE;
  }

  domain = grn_ctx_at(ctx, uvector->header.domain);
  if (!domain) {
    onig_free(regex);
    return GRN_FALSE;
  }

  grn_table_get_info(ctx, domain, NULL, NULL, NULL, &normalizer, NULL);
  if (!normalizer) {
    /* No normalizer on the domain: fall back to the auto normalizer. */
    normalizer_auto = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
  }

  for (i = 0; i < size; i++) {
    grn_id record_id;
    char key[GRN_TABLE_MAX_KEY_SIZE];
    int key_size;

    record_id = grn_uvector_get_element(ctx, uvector, i, NULL);
    key_size = grn_table_get_key(ctx, domain, record_id,
                                 key, GRN_TABLE_MAX_KEY_SIZE);
    if (key_size == 0) {
      continue;
    }

    if (normalizer) {
      matched = regexp_is_match(ctx, regex, key, key_size);
    } else {
      grn_obj *norm_key;
      const char *norm_key_raw;
      unsigned int norm_key_raw_length_in_bytes;

      norm_key = grn_string_open(ctx, key, key_size, normalizer_auto, 0);
      grn_string_get_normalized(ctx, norm_key,
                                &norm_key_raw,
                                &norm_key_raw_length_in_bytes,
                                NULL);
      matched = regexp_is_match(ctx, regex,
                                norm_key_raw,
                                norm_key_raw_length_in_bytes);
      grn_obj_unlink(ctx, norm_key);
    }

    if (matched) {
      break;
    }
  }

  if (normalizer_auto) {
    grn_obj_unlink(ctx, normalizer_auto);
  }

  grn_obj_unlink(ctx, domain);

  onig_free(regex);

  return matched;
#else /* GRN_SUPPORT_REGEXP */
  return GRN_FALSE;
#endif /* GRN_SUPPORT_REGEXP */
}
Example #5
0
/*
 * vector_slice(target, from[, length]): returns a new vector holding a
 * contiguous range of the elements of `target`.
 *
 * args[0] is the target and must be a GRN_VECTOR, GRN_PVECTOR or
 * GRN_UVECTOR. args[1] is the start index `from` and the optional args[2]
 * is `length`; both must belong to the number family and are converted to
 * 64-bit integers.
 *
 *   - A negative `from` counts from the end (it is wrapped by the vector
 *     size until it is non-negative).
 *   - A missing `length` defaults to -1. A negative `length` becomes
 *     `size + length + 1`; a `length` larger than the vector size is
 *     clamped to the size.
 *   - A range that is empty or starts past the last element yields an
 *     empty vector.
 *
 * The result is allocated with grn_plugin_proc_alloc() using the target's
 * domain and carries GRN_OBJ_WITH_WEIGHT when the target does. Returns NULL
 * (after reporting through GRN_PLUGIN_ERROR for argument problems) on
 * invalid arguments or when the allocation fails.
 */
static grn_obj *
func_vector_slice(grn_ctx *ctx, int n_args, grn_obj **args,
                  grn_user_data *user_data)
{
  grn_obj *target;
  grn_obj *from_raw = NULL;
  grn_obj *length_raw = NULL;
  int64_t from = 0;
  int64_t length = -1;
  int64_t end = 0;
  uint32_t to = 0;
  uint32_t size = 0;
  grn_obj *slice;

  if (n_args < 2 || n_args > 3) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "vector_slice(): wrong number of arguments (%d for 2..3)",
                     n_args);
    return NULL;
  }

  target = args[0];
  from_raw = args[1];
  if (n_args == 3) {
    length_raw = args[2];
  }
  switch (target->header.type) {
  case GRN_VECTOR :
  case GRN_PVECTOR :
  case GRN_UVECTOR :
    size = grn_vector_size(ctx, target);
    break;
  default :
    {
      grn_obj inspected;

      GRN_TEXT_INIT(&inspected, 0);
      /* Output buffer first, inspected object second, like every other
       * grn_inspect() call in this function. */
      grn_inspect(ctx, &inspected, target);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "vector_slice(): target object must be vector: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      return NULL;
    }
    break;
  }

  /* Convert `from` to int64_t. */
  if (!grn_type_id_is_number_family(ctx, from_raw->header.domain)) {
    grn_obj inspected;

    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, from_raw);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "vector_slice(): from must be a number: <%.*s>",
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    GRN_OBJ_FIN(ctx, &inspected);
    return NULL;
  }
  if (from_raw->header.domain == GRN_DB_INT32) {
    from = GRN_INT32_VALUE(from_raw);
  } else if (from_raw->header.domain == GRN_DB_INT64) {
    from = GRN_INT64_VALUE(from_raw);
  } else {
    grn_obj buffer;
    grn_rc rc;

    GRN_INT64_INIT(&buffer, 0);
    rc = grn_obj_cast(ctx, from_raw, &buffer, GRN_FALSE);
    if (rc == GRN_SUCCESS) {
      from = GRN_INT64_VALUE(&buffer);
    }
    GRN_OBJ_FIN(ctx, &buffer);

    if (rc != GRN_SUCCESS) {
      grn_obj inspected;

      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, from_raw);
      GRN_PLUGIN_ERROR(ctx, rc,
                       "vector_slice(): "
                       "failed to cast from value to number: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      return NULL;
    }
  }

  /* Convert the optional `length` to int64_t. */
  if (length_raw) {
    if (!grn_type_id_is_number_family(ctx, length_raw->header.domain)) {
      grn_obj inspected;

      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, length_raw);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "vector_slice(): length must be a number: <%.*s>",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      return NULL;
    }
    if (length_raw->header.domain == GRN_DB_INT32) {
      length = GRN_INT32_VALUE(length_raw);
    } else if (length_raw->header.domain == GRN_DB_INT64) {
      length = GRN_INT64_VALUE(length_raw);
    } else {
      grn_obj buffer;
      grn_rc rc;

      GRN_INT64_INIT(&buffer, 0);
      rc = grn_obj_cast(ctx, length_raw, &buffer, GRN_FALSE);
      if (rc == GRN_SUCCESS) {
        length = GRN_INT64_VALUE(&buffer);
      }
      GRN_OBJ_FIN(ctx, &buffer);

      if (rc != GRN_SUCCESS) {
        grn_obj inspected;

        GRN_TEXT_INIT(&inspected, 0);
        grn_inspect(ctx, &inspected, length_raw);
        GRN_PLUGIN_ERROR(ctx, rc,
                         "vector_slice(): "
                         "failed to cast length value to number: <%.*s>",
                         (int)GRN_TEXT_LEN(&inspected),
                         GRN_TEXT_VALUE(&inspected));
        GRN_OBJ_FIN(ctx, &inspected);
        return NULL;
      }
    }
  }

  slice = grn_plugin_proc_alloc(ctx, user_data, target->header.domain, GRN_OBJ_VECTOR);
  if (!slice) {
    return NULL;
  }

  if (target->header.flags & GRN_OBJ_WITH_WEIGHT) {
    slice->header.flags |= GRN_OBJ_WITH_WEIGHT;
  }

  if (length < 0) {
    length = size + length + 1;
  }

  if (length > size) {
    length = size;
  }

  if (length <= 0) {
    return slice;
  }

  /* From here on 1 <= length <= size, hence size > 0 and the modulo below
   * is well defined. */
  if (from < 0) {
    /* Same result as adding `size` until `from` is non-negative, but in
     * constant time even for a huge negative `from`. */
    from %= (int64_t)size;
    if (from < 0) {
      from += size;
    }
  }

  /* A start past the last element selects nothing. Checking in 64 bits also
   * keeps a `from` of 2^32 or more from wrapping to a valid index when it is
   * narrowed to the 32-bit loop counters below. */
  if (from >= (int64_t)size) {
    return slice;
  }

  /* from < size and length <= size, so the sum cannot overflow int64_t. */
  end = from + length;
  if (end > (int64_t)size) {
    end = size;
  }
  to = (uint32_t)end;

  switch (target->header.type) {
  case GRN_VECTOR :
    {
      unsigned int i;
      for (i = (unsigned int)from; i < to; i++) {
        const char *content;
        unsigned int content_length;
        unsigned int weight = 0;
        grn_id domain;
        content_length = grn_vector_get_element(ctx, target, i,
                                                &content, &weight, &domain);
        grn_vector_add_element(ctx, slice,
                               content, content_length, weight, domain);
      }
    }
    break;
  case GRN_PVECTOR :
    {
      unsigned int i;
      for (i = (unsigned int)from; i < to; i++) {
        grn_obj *element = GRN_PTR_VALUE_AT(target, i);
        GRN_PTR_PUT(ctx, slice, element);
      }
    }
    break;
  case GRN_UVECTOR :
    {
      unsigned int i;
      for (i = (unsigned int)from; i < to; i++) {
        grn_id id;
        unsigned int weight = 0;
        id = grn_uvector_get_element(ctx, target, i, &weight);
        grn_uvector_add_element(ctx, slice, id, weight);
      }
    }
    break;
  default :
    /* Unreachable: other types were rejected by the first switch. */
    break;
  }

  return slice;
}