示例#1
0
文件: query.c 项目: darashi/groonga
/*
 * Scan `nstrs` strings against the query `q`.
 *
 * strs/str_lens : parallel arrays holding each string and its byte length.
 * flags         : GRN_QUERY_SCAN_* bits. GRN_QUERY_SCAN_NORMALIZE asks for the
 *                 strings to be normalized before scanning.
 *                 GRN_QUERY_SCAN_ALLOCCONDS is managed internally and must not
 *                 be passed by a caller when `q` already owns snip conditions.
 * found/score   : output counters; both are reset to 0 before scanning and are
 *                 then updated by scan_query().
 *
 * Returns GRN_SUCCESS, GRN_INVALID_ARGUMENT for bad arguments or flags,
 * GRN_NO_MEMORY_AVAILABLE when a string cannot be opened, or whatever error
 * alloc_snip_conds()/scan_query() report.
 */
grn_rc
grn_query_scan(grn_ctx *ctx, grn_query *q, const char **strs, unsigned int *str_lens, unsigned int nstrs,
               int flags, int *found, int *score)
{
  unsigned int i;
  grn_rc rc;
  /* str_lens, found and score are dereferenced unconditionally below, so
     they are validated together with the other mandatory arguments. */
  if (!q || !strs || !str_lens || !nstrs || !found || !score) {
    return GRN_INVALID_ARGUMENT;
  }
  *found = *score = 0;
  if (!q->snip_conds) {
    if ((rc = alloc_snip_conds(ctx, q))) { return rc; }
    flags |= GRN_QUERY_SCAN_ALLOCCONDS;
  } else if (flags & GRN_QUERY_SCAN_ALLOCCONDS) {
    GRN_LOG(ctx, GRN_LOG_WARNING, "invalid flags specified on grn_query_scan");
    return GRN_INVALID_ARGUMENT;
  }
  for (i = 0; i < nstrs; i++) {
    grn_str *n;
    snip_cond *sc = q->snip_conds;
    int f = GRN_STR_WITH_CHECKS | GRN_STR_REMOVEBLANK;
    if (flags & GRN_QUERY_SCAN_NORMALIZE) { f |= GRN_STR_NORMALIZE; }
    n = grn_str_open(ctx, strs[i], str_lens[i], f);
    if (!n) { return GRN_NO_MEMORY_AVAILABLE; }
    /* The section number handed to scan_query() is 1-based. */
    rc = scan_query(ctx, q, n, i + 1, q->expr, &sc, GRN_OP_OR, flags, found, score);
    grn_str_close(ctx, n);
    if (rc) { return rc; }
    /* The ALLOCCONDS bit is only kept for the first string. */
    flags &= ~GRN_QUERY_SCAN_ALLOCCONDS;
  }
  return GRN_SUCCESS;
}
示例#2
0
/*
 * Data-driven check of string normalization.
 *
 * Reads "encoding", "input" and "expected" from the test data, normalizes the
 * input (converted to the requested encoding) and compares both the normalized
 * bytes and their length against the converted expected text.
 */
void
test_normalize(gconstpointer data)
{
  const int open_flags =
    GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES;
  grn_encoding enc;
  const gchar *input_utf8, *input_encoded;
  const gchar *expected_utf8, *expected_encoded;
  grn_str *str;
  const gchar *actual;
  guint actual_len;

  enc = gcut_data_get_int(data, "encoding");
  GRN_CTX_SET_ENCODING(&context, enc);

  input_utf8 = gcut_data_get_string(data, "input");
  input_encoded = convert_encoding(input_utf8, enc);
  str = grn_str_open(&context, input_encoded, strlen(input_encoded),
                     open_flags);
  /* Copy the result out before the grn_str is closed. */
  actual = cut_take_strndup(str->norm, str->norm_blen);
  actual_len = str->norm_blen;
  grn_test_assert(grn_str_close(&context, str));

  expected_utf8 = gcut_data_get_string(data, "expected");
  expected_encoded = convert_encoding(expected_utf8, enc);
  cut_assert_equal_string(expected_encoded, actual);
  cut_assert_equal_int(strlen(expected_encoded), actual_len);
}
示例#3
0
文件: token.c 项目: ryoqun/groonga
/*
 * Finalizer of the n-gram tokenizer.
 *
 * Releases the tokenizer state stored in user_data->ptr: closes its grn_str
 * and frees the state itself. Always returns NULL.
 */
static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_ngram_tokenizer *token = user_data->ptr;
  /* No state attached (e.g. initialization did not complete): nothing to do. */
  if (!token) { return NULL; }
  grn_str_close(ctx, token->nstr);
  GRN_FREE(token);
  /* Do not leave a dangling pointer behind for a repeated finalization. */
  user_data->ptr = NULL;
  return NULL;
}
示例#4
0
文件: token.c 项目: ikdttr/groonga
/*
 * Finalizer of the n-gram tokenizer.
 *
 * Releases the tokenizer state stored in user_data->ptr: closes its grn_str
 * and frees the state itself. Always returns GRN_SUCCESS.
 */
static grn_rc
ngram_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_ngram_tokenizer *token = user_data->ptr;
  /* No state attached (e.g. initialization did not complete): nothing to do. */
  if (!token) { return GRN_SUCCESS; }
  grn_str_close(ctx, token->nstr);
  GRN_FREE(token);
  /* Do not leave a dangling pointer behind for a repeated finalization. */
  user_data->ptr = NULL;
  return GRN_SUCCESS;
}
示例#5
0
文件: token.c 项目: ikdttr/groonga
/*
 * Finalizer of the MeCab tokenizer.
 *
 * Releases the tokenizer state stored in user_data->ptr: closes its grn_str,
 * frees the tokenized text buffer and then the state itself.
 * Always returns GRN_SUCCESS.
 */
static grn_rc
mecab_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  grn_mecab_tokenizer *token = user_data->ptr;
  /* No state attached (e.g. initialization did not complete): nothing to do. */
  if (!token) { return GRN_SUCCESS; }
  /* NOTE(review): token->mecab is deliberately not destroyed here (the
     mecab_destroy() call was disabled in the original code); presumably the
     handle is owned elsewhere -- confirm against mecab_init. */
  grn_str_close(ctx, token->nstr);
  GRN_FREE(token->buf);
  GRN_FREE(token);
  /* Do not leave a dangling pointer behind for a repeated finalization. */
  user_data->ptr = NULL;
  return GRN_SUCCESS;
}
示例#6
0
/*
 * Data-driven check that normalizing broken input yields an empty result.
 *
 * The context encoding ("context-encoding") and the encoding the input is
 * converted to ("input-encoding") are taken separately from the test data.
 * A negative "input-length" means "use the full length of the converted
 * input".
 */
void
test_normalize_broken(gconstpointer data)
{
  const int open_flags =
    GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES;
  grn_encoding ctx_enc, src_enc;
  const gchar *raw, *converted;
  gint length;
  grn_str *str;

  ctx_enc = gcut_data_get_int(data, "context-encoding");
  GRN_CTX_SET_ENCODING(&context, ctx_enc);

  raw = gcut_data_get_string(data, "input");
  src_enc = gcut_data_get_int(data, "input-encoding");
  length = gcut_data_get_int(data, "input-length");
  converted = convert_encoding(raw, src_enc);
  if (length < 0) {
    length = strlen(converted);
  }

  str = grn_str_open(&context, converted, length, open_flags);
  cut_assert_equal_string("", str->norm);
  cut_assert_equal_int(0, str->norm_blen);
  grn_test_assert(grn_str_close(&context, str));
}
示例#7
0
文件: dat.cpp 项目: mworrell/groonga
// Scans `str` (of `str_size` bytes) for keys registered in `dat` using
// repeated longest-common-prefix searches, and records up to
// `max_num_scan_hits` hits in `scan_hits`.
//
// When the table has GRN_OBJ_KEY_NORMALIZE set, the search runs on the
// normalized text, while the offsets/lengths stored in `scan_hits` are
// accumulated from the `checks` array of the normalized string.
//
// If `str_rest` is not NULL it receives the position where scanning stopped.
//
// Returns the number of hits, or -1 on error (trie unavailable, NULL `str` or
// `scan_hits`, table without GRN_OBJ_KEY_VAR_SIZE, normalization failure, or
// a grn::dat::Exception raised by the trie).
int grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
                         unsigned int str_size, grn_dat_scan_hit *scan_hits,
                         unsigned int max_num_scan_hits, const char **str_rest) {
  if (!grn_dat_open_trie_if_needed(ctx, dat) || !str ||
      !(dat->obj.header.flags & GRN_OBJ_KEY_VAR_SIZE) || !scan_hits) {
    return -1;
  }

  grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie);
  if (!trie) {
    return -1;
  }

  if (!max_num_scan_hits || !str_size) {
    if (str_rest) {
      *str_rest = str;
    }
    return 0;
  }

  int num_scan_hits = 0;
  // Declared outside the try block so that the exception handler can release
  // it; otherwise the normalized string would leak when the trie throws.
  grn_str *normalized_str = NULL;
  try {
    if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
      normalized_str = grn_str_open(
          ctx, str, str_size, GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS);
      if (!normalized_str) {
        fprintf(stderr, "error: grn_str_open() failed!\n");
        return -1;
      }
      str = normalized_str->norm;
      str_size = normalized_str->norm_blen;
      const short *checks = normalized_str->checks;
      unsigned int offset = 0;
      while (str_size) {
        // A search is only attempted at positions whose check value is
        // non-zero.
        if (*checks) {
          grn::dat::UInt32 key_pos;
          if (trie->lcp_search(str, str_size, &key_pos)) {
            const grn::dat::Key &key = trie->get_key(key_pos);
            const grn::dat::UInt32 key_length = key.length();
            // Accept the match only if it ends at the end of the text or at a
            // position whose check value is non-zero.
            if ((key_length == str_size) || (checks[key_length])) {
              unsigned int length = 0;
              for (grn::dat::UInt32 i = 0; i < key_length; ++i) {
                if (checks[i] > 0) {
                  length += checks[i];
                }
              }
              scan_hits[num_scan_hits].id = key.id();
              scan_hits[num_scan_hits].offset = offset;
              scan_hits[num_scan_hits].length = length;
              offset += length;
              str += key_length;
              str_size -= key_length;
              checks += key_length;
              if (++num_scan_hits >= max_num_scan_hits) {
                break;
              }
              continue;
            }
          }
          if (*checks > 0) {
            offset += *checks;
          }
        }
        ++str;
        --str_size;
        ++checks;
      }
      if (str_rest) {
        *str_rest = normalized_str->orig + offset;
      }
      grn_str_close(ctx, normalized_str);
      normalized_str = NULL;
    } else {
      const char * const begin = str;
      while (str_size) {
        grn::dat::UInt32 key_pos;
        if (trie->lcp_search(str, str_size, &key_pos)) {
          const grn::dat::Key &key = trie->get_key(key_pos);
          scan_hits[num_scan_hits].id = key.id();
          scan_hits[num_scan_hits].offset = str - begin;
          scan_hits[num_scan_hits].length = key.length();
          str += key.length();
          str_size -= key.length();
          if (++num_scan_hits >= max_num_scan_hits) {
            break;
          }
        } else {
          // No key starts here: skip one character, or a single byte when
          // grn_charlen() reports 0.
          const int char_length = grn_charlen(ctx, str, str + str_size);
          if (char_length) {
            str += char_length;
            str_size -= char_length;
          } else {
            ++str;
            --str_size;
          }
        }
      }
      if (str_rest) {
        *str_rest = str;
      }
    }
  } catch (const grn::dat::Exception &ex) {
    if (normalized_str) {
      grn_str_close(ctx, normalized_str);
    }
    ERR(grn_dat_translate_error_code(ex.code()),
        "grn::dat::lcp_search failed");
    return -1;
  }
  return num_scan_hits;
}
示例#8
0
文件: mecab.c 项目: firewood/groonga
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).

  The text to tokenize is popped from `ctx'; args[0] is the table whose flags
  and encoding are used. On success the tokenizer state (normalized string
  and a private copy of MeCab's wakati output) is stored in user_data->ptr.
  On failure user_data->ptr is left untouched.
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf = NULL, *end;
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize = 0, len;
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  /* Create the process-wide MeCab instance on first use. */
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR, "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }
  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  /* NOTE(review): the message below says "context encoding" although the
     value compared and printed is the table's encoding. */
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  len = token->nstr->norm_blen;
  /* The string returned by MeCab is copied while the lock is still held. */
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%u err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }
  /* A certain version of mecab returns trailing lf or spaces.
     `end' starts at the terminating NUL and walks backwards, so it never
     points before `buf', even when MeCab returned an empty string. */
  end = buf + bufsize - 1;
  while (buf < end && isspace((unsigned char)end[-1])) {
    *--end = '\0';
  }
  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = end;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}