コード例 #1
0
ファイル: query.c プロジェクト: darashi/groonga
inline static grn_cell *
get_phrase(grn_ctx *ctx, grn_query *q)
{
  char *start, *s, *d;
  start = s = d = q->cur;
  while (1) {
    unsigned int len;
    if (s >= q->str_end) {
      q->cur = s;
      break;
    }
    len = grn_charlen(ctx, s, q->str_end);
    if (len == 0) {
      /* invalid string containing malformed multibyte char */
      return NULL;
    } else if (len == 1) {
      if (*s == GRN_QUERY_QUOTER) {
        q->cur = s + 1;
        break;
      } else if (*s == GRN_QUERY_ESCAPE && s + 1 < q->str_end) {
        s++;
        len = grn_charlen(ctx, s, q->str_end);
      }
    }
    while (len--) { *d++ = *s++; }
  }
  return token_new(q, start, d);
}
コード例 #2
0
ファイル: query.c プロジェクト: darashi/groonga
inline static grn_cell *
get_word(grn_ctx *ctx, grn_query *q, int *prefixp)
{
  char *start = q->cur, *end;
  unsigned int len;
  for (end = q->cur;; ) {
    /* null check and length check */
    if (!(len = grn_charlen(ctx, end, q->str_end))) {
      q->cur = q->str_end;
      break;
    }
    if (grn_isspace(end, q->encoding) ||
        *end == GRN_QUERY_PARENR) {
      q->cur = end;
      break;
    }
    if (*end == GRN_QUERY_PREFIX) {
      *prefixp = 1;
      q->cur = end + 1;
      break;
    }
    end += len;
  }
  return token_new(q, start, end);
}
コード例 #3
0
ファイル: query.c プロジェクト: darashi/groonga
inline static void
skip_space(grn_ctx *ctx, grn_query *q)
{
  unsigned int len;
  while (q->cur < q->str_end && grn_isspace(q->cur, q->encoding)) {
    /* null check and length check */
    if (!(len = grn_charlen(ctx, q->cur, q->str_end))) {
      q->cur = q->str_end;
      break;
    }
    q->cur += len;
  }
}
コード例 #4
0
ファイル: proc_fuzzy_search.c プロジェクト: XLPE/groonga
static uint32_t
calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey, int flags)
{
  int d = 0;
  uint32_t cx, lx, cy, ly, *dists;
  char *px, *py;
  for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
  for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
  if ((dists = GRN_PLUGIN_MALLOC(ctx, (lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
    uint32_t x, y;
    for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
    for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
    for (x = 1, px = sx; x <= lx; x++, px += cx) {
      cx = grn_charlen(ctx, px, ex);
      for (y = 1, py = sy; y <= ly; y++, py += cy) {
        cy = grn_charlen(ctx, py, ey);
        if (cx == cy && !memcmp(px, py, cx)) {
          DIST(x, y) = DIST(x - 1, y - 1);
        } else {
          uint32_t a = DIST(x - 1, y) + 1;
          uint32_t b = DIST(x, y - 1) + 1;
          uint32_t c = DIST(x - 1, y - 1) + 1;
          DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
          if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION &&
              x > 1 && y > 1 && cx == cy &&
              memcmp(px, py - cy, cx) == 0 &&
              memcmp(px - cx, py, cx) == 0) {
            uint32_t t = DIST(x - 2, y - 2) + 1;
            DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
          }
        }
      }
    }
    d = DIST(lx, ly);
    GRN_PLUGIN_FREE(ctx, dists);
  }
  return d;
}
コード例 #5
0
ファイル: test-string.c プロジェクト: mooz/groonga
void
test_charlen_broken(gconstpointer data)
{
  const gchar *input, *encoded_input, *encoded_input_end;
  grn_encoding encoding;
  gint input_length;

  encoding = gcut_data_get_int(data, "encoding");
  GRN_CTX_SET_ENCODING(&context, encoding);

  input = gcut_data_get_string(data, "input");
  input_length = gcut_data_get_int(data, "input-length");
  encoded_input = convert_encoding(input, encoding);
  if (input_length < 0) {
    input_length = strlen(encoded_input);
  }
  encoded_input_end = encoded_input + input_length;
  cut_assert_equal_uint(0, grn_charlen(&context,
                                       encoded_input,
                                       encoded_input_end));
}
コード例 #6
0
ファイル: proc_fuzzy_search.c プロジェクト: XLPE/groonga
static grn_rc
selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index,
                      int nargs, grn_obj **args,
                      grn_obj *res, grn_operator op)
{
  grn_rc rc = GRN_SUCCESS;
  grn_obj *target = NULL;
  grn_obj *obj;
  grn_obj *query;
  uint32_t max_distance = 1;
  uint32_t prefix_length = 0;
  uint32_t prefix_match_size = 0;
  uint32_t max_expansion = 0;
  int flags = 0;
  grn_bool use_sequential_search = GRN_FALSE;

  if ((nargs - 1) < 2) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "fuzzy_search(): wrong number of arguments (%d ...)",
                     nargs - 1);
    rc = ctx->rc;
    goto exit;
  }
  obj = args[1];
  query = args[2];

  if (nargs == 4) {
    grn_obj *options = args[3];
    grn_hash_cursor *cursor;
    void *key;
    grn_obj *value;
    int key_size;

    if (options->header.type != GRN_TABLE_HASH_KEY) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "fuzzy_search(): "
                       "3rd argument must be object literal: <%.*s>",
                       (int)GRN_TEXT_LEN(options),
                       GRN_TEXT_VALUE(options));
      goto exit;
    }

    cursor = grn_hash_cursor_open(ctx, (grn_hash *)options,
                                  NULL, 0, NULL, 0,
                                  0, -1, 0);
    if (!cursor) {
      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                       "fuzzy_search(): couldn't open cursor");
      goto exit;
    }
    while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
      grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size,
                                    (void **)&value);

      if (key_size == 12 && !memcmp(key, "max_distance", 12)) {
        max_distance = GRN_UINT32_VALUE(value);
      } else if (key_size == 13 && !memcmp(key, "prefix_length", 13)) {
        prefix_length = GRN_UINT32_VALUE(value);
      } else if (key_size == 13 && !memcmp(key, "max_expansion", 13)) {
        max_expansion = GRN_UINT32_VALUE(value);
      } else if (key_size == 18 && !memcmp(key, "with_transposition", 18)) {
        if (GRN_BOOL_VALUE(value)) {
          flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
        }
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "invalid option name: <%.*s>",
                         key_size, (char *)key);
        grn_hash_cursor_close(ctx, cursor);
        goto exit;
      }
    }
    grn_hash_cursor_close(ctx, cursor);
  }

  if (index) {
    target = index;
  } else {
    if (obj->header.type == GRN_COLUMN_INDEX) {
      target = obj;
    } else {
      grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL);
    }
  }

  if (target) {
    grn_obj *lexicon;
    use_sequential_search = GRN_TRUE;
    lexicon = grn_ctx_at(ctx, target->header.domain);
    if (lexicon) {
      if (lexicon->header.type == GRN_TABLE_PAT_KEY) {
        use_sequential_search = GRN_FALSE;
      }
      grn_obj_unlink(ctx, lexicon);
    }
  } else {
    if (grn_obj_is_key_accessor(ctx, obj) &&
        table->header.type == GRN_TABLE_PAT_KEY) {
      target = table;
    } else {
      use_sequential_search = GRN_TRUE;
    }
  }

  if (prefix_length) {
    const char *s = GRN_TEXT_VALUE(query);
    const char *e = GRN_BULK_CURR(query);
    const char *p;
    unsigned int cl = 0;
    unsigned int length = 0;
    for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) {
      length++;
      if (length > prefix_length) {
        break;
      }
    }
    prefix_match_size = p - s;
  }

  if (use_sequential_search) {
    rc = sequential_fuzzy_search(ctx, table, obj, query,
                                 max_distance, prefix_match_size,
                                 max_expansion, flags, res, op);
    goto exit;
  }

  if (!target) {
    grn_obj inspected;
    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, target);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "fuzzy_search(): "
                     "column must be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>",
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    rc = ctx->rc;
    GRN_OBJ_FIN(ctx, &inspected);
  } else {
    grn_search_optarg options = {0};
    options.mode = GRN_OP_FUZZY;
    options.fuzzy.prefix_match_size = prefix_match_size;
    options.fuzzy.max_distance = max_distance;
    options.fuzzy.max_expansion = max_expansion;
    options.fuzzy.flags = flags;
    grn_obj_search(ctx, target, query, res, op, &options);
  }

exit :
  return rc;
}
コード例 #7
0
ファイル: operator.c プロジェクト: groonga/groonga
static grn_bool
string_have_sub_text(grn_ctx *ctx,
                     const char *text, unsigned int text_len,
                     const char *sub_text, unsigned int sub_text_len)
{
  if (sub_text_len == 0) {
    return GRN_FALSE;
  }

  if (sub_text_len > text_len) {
    return GRN_FALSE;
  }

#ifdef GRN_SUPPORT_REGEXP
  if (grn_onigmo_is_valid_encoding(ctx)) {
    OnigRegex regex;
    grn_bool matched;

    regex = grn_onigmo_new(ctx,
                           sub_text,
                           sub_text_len,
                           GRN_ONIGMO_OPTION_DEFAULT,
                           ONIG_SYNTAX_ASIS,
                           "[operator]");
    if (!regex) {
      return GRN_FALSE;
    }

    matched = regexp_is_match(ctx, regex, text, text_len);
    onig_free(regex);
    return matched;
  }
#endif /* GRN_SUPPORT_REGEXP */
  {
    const char *text_current = text;
    const char *text_end = text + text_len;
    const char *sub_text_current = sub_text;
    const char *sub_text_end = sub_text + sub_text_len;
    int sub_text_start_char_len;
    int sub_text_char_len;

    sub_text_start_char_len = grn_charlen(ctx, sub_text, sub_text_end);
    if (sub_text_start_char_len == 0) {
      return GRN_FALSE;
    }
    sub_text_char_len = sub_text_start_char_len;

    while (text_current < text_end) {
      int text_char_len;

      text_char_len = grn_charlen(ctx, text_current, text_end);
      if (text_char_len == 0) {
        return GRN_FALSE;
      }

      if (text_char_len == sub_text_char_len &&
          memcmp(text_current, sub_text_current, text_char_len) == 0) {
        sub_text_current += sub_text_char_len;
        if (sub_text_current == sub_text_end) {
          return GRN_TRUE;
        }

        sub_text_char_len = grn_charlen(ctx, sub_text_current, sub_text_end);
        if (sub_text_char_len == 0) {
          return GRN_FALSE;
        }
      } else {
        if (sub_text_current != sub_text) {
          sub_text_current = sub_text;
          sub_text_char_len = sub_text_start_char_len;
          continue;
        }
      }

      text_current += text_char_len;
    }

    return GRN_FALSE;
  }
}
コード例 #8
0
static grn_string *
grn_fake_string_open(grn_ctx *ctx, grn_string *string)
{
  /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
  grn_string *nstr = string;
  const char *str;
  unsigned int str_len;

  str = nstr->original;
  str_len = nstr->original_length_in_bytes;

  if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[strinig][fake] failed to allocate normalized text space");
    grn_string_close(ctx, (grn_obj *)nstr);
    return NULL;
  }

  if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER &&
      ctx->encoding == GRN_ENC_UTF8) {
    int char_length;
    const char *source_current = str;
    const char *source_end = str + str_len;
    char *destination = nstr->normalized;
    unsigned int destination_length = 0;
    while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
      if (!grn_tokenizer_is_tokenized_delimiter(ctx,
                                                source_current, char_length,
                                                ctx->encoding)) {
        memcpy(destination, source_current, char_length);
        destination += char_length;
        destination_length += char_length;
      }
      source_current += char_length;
    }
    nstr->normalized[destination_length] = '\0';
    nstr->normalized_length_in_bytes = destination_length;
  } else {
    memcpy(nstr->normalized, str, str_len);
    nstr->normalized[str_len] = '\0';
    nstr->normalized_length_in_bytes = str_len;
  }

  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
    int16_t f = 0;
    unsigned char c;
    size_t i;
    if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
      grn_string_close(ctx, (grn_obj *)nstr);
      ERR(GRN_NO_MEMORY_AVAILABLE,
          "[strinig][fake] failed to allocate checks space");
      return NULL;
    }
    switch (nstr->encoding) {
    case GRN_ENC_EUC_JP:
      for (i = 0; i < str_len; i++) {
        if (!f) {
          c = (unsigned char) str[i];
          f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
            );
          nstr->checks[i] = f;
        } else {
          nstr->checks[i] = 0;
        }
        f--;
      }
      break;
    case GRN_ENC_SJIS:
      for (i = 0; i < str_len; i++) {
        if (!f) {
          c = (unsigned char) str[i];
          f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
          nstr->checks[i] = f;
        } else {
          nstr->checks[i] = 0;
        }
        f--;
      }
      break;
    case GRN_ENC_UTF8:
      for (i = 0; i < str_len; i++) {
        if (!f) {
          c = (unsigned char) str[i];
          f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
                           : 2)
               : 1);
          nstr->checks[i] = f;
        } else {
          nstr->checks[i] = 0;
        }
        f--;
      }
      break;
    default:
      for (i = 0; i < str_len; i++) {
        nstr->checks[i] = 1;
      }
      break;
    }
  }
  return nstr;
}
コード例 #9
0
ファイル: load.c プロジェクト: cosmo0920/groonga
static void
json_read(grn_ctx *ctx, grn_loader *loader, const char *str, unsigned int str_len)
{
  const char *const beg = str;
  char c;
  int len;
  const char *se = str + str_len;
  while (str < se) {
    c = *str;
    switch (loader->stat) {
    case GRN_LOADER_BEGIN :
      if ((len = grn_isspace(str, ctx->encoding))) {
        str += len;
        continue;
      }
      switch (c) {
      case '[' :
        JSON_READ_OPEN_BRACKET();
        break;
      case '{' :
        JSON_READ_OPEN_BRACE();
        break;
      default :
        ERR(GRN_INVALID_ARGUMENT,
            "JSON must start with '[' or '{': <%.*s>", str_len, beg);
        loader->stat = GRN_LOADER_END;
        break;
      }
      break;
    case GRN_LOADER_TOKEN :
      if ((len = grn_isspace(str, ctx->encoding))) {
        str += len;
        continue;
      }
      switch (c) {
      case '"' :
        loader->stat = GRN_LOADER_STRING;
        values_add(ctx, loader);
        str++;
        break;
      case '[' :
        JSON_READ_OPEN_BRACKET();
        break;
      case '{' :
        JSON_READ_OPEN_BRACE();
        break;
      case ':' :
        str++;
        break;
      case ',' :
        str++;
        break;
      case ']' :
        bracket_close(ctx, loader);
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        if (ctx->rc == GRN_CANCEL) {
          loader->stat = GRN_LOADER_END;
        }
        str++;
        break;
      case '}' :
        brace_close(ctx, loader);
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        if (ctx->rc == GRN_CANCEL) {
          loader->stat = GRN_LOADER_END;
        }
        str++;
        break;
      case '+' : case '-' : case '0' : case '1' : case '2' : case '3' :
      case '4' : case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->stat = GRN_LOADER_NUMBER;
        values_add(ctx, loader);
        break;
      default :
        if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('_' == c)) {
          loader->stat = GRN_LOADER_SYMBOL;
          values_add(ctx, loader);
        } else {
          if ((len = grn_charlen(ctx, str, se))) {
            GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char('%c') at", c);
            GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg) + len, beg);
            GRN_LOG(ctx, GRN_LOG_ERROR, "%*s", (int)(str - beg) + 1, "^");
            str += len;
          } else {
            GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char(\\x%.2x) after", c);
            GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg), beg);
            str = se;
          }
        }
        break;
      }
      break;
    case GRN_LOADER_SYMBOL :
      if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') ||
          ('0' <= c && c <= '9') || ('_' == c)) {
        GRN_TEXT_PUTC(ctx, loader->last, c);
        str++;
      } else {
        char *v = GRN_TEXT_VALUE(loader->last);
        switch (*v) {
        case 'n' :
          if (GRN_TEXT_LEN(loader->last) == 4 && !memcmp(v, "null", 4)) {
            loader->last->header.domain = GRN_DB_VOID;
            GRN_BULK_REWIND(loader->last);
          }
          break;
        case 't' :
          if (GRN_TEXT_LEN(loader->last) == 4 && !memcmp(v, "true", 4)) {
            loader->last->header.domain = GRN_DB_BOOL;
            GRN_BOOL_SET(ctx, loader->last, GRN_TRUE);
          }
          break;
        case 'f' :
          if (GRN_TEXT_LEN(loader->last) == 5 && !memcmp(v, "false", 5)) {
            loader->last->header.domain = GRN_DB_BOOL;
            GRN_BOOL_SET(ctx, loader->last, GRN_FALSE);
          }
          break;
        default :
          break;
        }
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
      }
      break;
    case GRN_LOADER_NUMBER :
      switch (c) {
      case '+' : case '-' : case '.' : case 'e' : case 'E' :
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        GRN_TEXT_PUTC(ctx, loader->last, c);
        str++;
        break;
      default :
        {
          const char *cur, *str = GRN_BULK_HEAD(loader->last);
          const char *str_end = GRN_BULK_CURR(loader->last);
          int64_t i = grn_atoll(str, str_end, &cur);
          if (cur == str_end) {
            loader->last->header.domain = GRN_DB_INT64;
            GRN_INT64_SET(ctx, loader->last, i);
          } else if (cur != str) {
            uint64_t i = grn_atoull(str, str_end, &cur);
            if (cur == str_end) {
              loader->last->header.domain = GRN_DB_UINT64;
              GRN_UINT64_SET(ctx, loader->last, i);
            } else if (cur != str) {
              double d;
              char *end;
              grn_obj buf;
              GRN_TEXT_INIT(&buf, 0);
              GRN_TEXT_PUT(ctx, &buf, str, GRN_BULK_VSIZE(loader->last));
              GRN_TEXT_PUTC(ctx, &buf, '\0');
              errno = 0;
              d = strtod(GRN_TEXT_VALUE(&buf), &end);
              if (!errno && end + 1 == GRN_BULK_CURR(&buf)) {
                loader->last->header.domain = GRN_DB_FLOAT;
                GRN_FLOAT_SET(ctx, loader->last, d);
              }
              GRN_OBJ_FIN(ctx, &buf);
            }
          }
        }
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        break;
      }
      break;
    case GRN_LOADER_STRING :
      switch (c) {
      case '\\' :
        loader->stat = GRN_LOADER_STRING_ESC;
        str++;
        break;
      case '"' :
        str++;
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        /*
        *(GRN_BULK_CURR(loader->last)) = '\0';
        GRN_LOG(ctx, GRN_LOG_ALERT, "read str(%s)", GRN_TEXT_VALUE(loader->last));
        */
        break;
      default :
        if ((len = grn_charlen(ctx, str, se))) {
          GRN_TEXT_PUT(ctx, loader->last, str, len);
          str += len;
        } else {
          GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char(\\x%.2x) after", c);
          GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg), beg);
          str = se;
        }
        break;
      }
      break;
    case GRN_LOADER_STRING_ESC :
      switch (c) {
      case 'b' :
        GRN_TEXT_PUTC(ctx, loader->last, '\b');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'f' :
        GRN_TEXT_PUTC(ctx, loader->last, '\f');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'n' :
        GRN_TEXT_PUTC(ctx, loader->last, '\n');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'r' :
        GRN_TEXT_PUTC(ctx, loader->last, '\r');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 't' :
        GRN_TEXT_PUTC(ctx, loader->last, '\t');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'u' :
        loader->stat = GRN_LOADER_UNICODE0;
        break;
      default :
        GRN_TEXT_PUTC(ctx, loader->last, c);
        loader->stat = GRN_LOADER_STRING;
        break;
      }
      str++;
      break;
    case GRN_LOADER_UNICODE0 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar = (c - '0') * 0x1000;
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar = (c - 'a' + 10) * 0x1000;
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar = (c - 'A' + 10) * 0x1000;
        break;
      default :
        ;// todo : error
      }
      loader->stat = GRN_LOADER_UNICODE1;
      str++;
      break;
    case GRN_LOADER_UNICODE1 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar += (c - '0') * 0x100;
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar += (c - 'a' + 10) * 0x100;
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar += (c - 'A' + 10) * 0x100;
        break;
      default :
        ;// todo : error
      }
      loader->stat = GRN_LOADER_UNICODE2;
      str++;
      break;
    case GRN_LOADER_UNICODE2 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar += (c - '0') * 0x10;
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar += (c - 'a' + 10) * 0x10;
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar += (c - 'A' + 10) * 0x10;
        break;
      default :
        ;// todo : error
      }
      loader->stat = GRN_LOADER_UNICODE3;
      str++;
      break;
    case GRN_LOADER_UNICODE3 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar += (c - '0');
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar += (c - 'a' + 10);
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar += (c - 'A' + 10);
        break;
      default :
        ;// todo : error
      }
      {
        uint32_t u = loader->unichar;
        if (u >= 0xd800 && u <= 0xdbff) { /* High-surrogate code points */
          loader->unichar_hi = u;
          loader->stat = GRN_LOADER_STRING;
          str++;
          break;
        }
        if (u >= 0xdc00 && u <= 0xdfff) { /* Low-surrogate code points */
          u = 0x10000 + (loader->unichar_hi - 0xd800) * 0x400 + u - 0xdc00;
        }
        if (u < 0x80) {
          GRN_TEXT_PUTC(ctx, loader->last, u);
        } else {
          if (u < 0x800) {
            GRN_TEXT_PUTC(ctx, loader->last, (u >> 6) | 0xc0);
          } else {
            if (u < 0x10000) {
              GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0);
            } else {
              GRN_TEXT_PUTC(ctx, loader->last, (u >> 18) | 0xf0);
              GRN_TEXT_PUTC(ctx, loader->last, ((u >> 12) & 0x3f) | 0x80);
            }
            GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x3f) | 0x80);
          }
          GRN_TEXT_PUTC(ctx, loader->last, (u & 0x3f) | 0x80);
        }
コード例 #10
0
ファイル: dat.cpp プロジェクト: mworrell/groonga
int grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
                         unsigned int str_size, grn_dat_scan_hit *scan_hits,
                         unsigned int max_num_scan_hits, const char **str_rest) {
  if (!grn_dat_open_trie_if_needed(ctx, dat) || !str ||
      !(dat->obj.header.flags & GRN_OBJ_KEY_VAR_SIZE) || !scan_hits) {
    return -1;
  }

  grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie);
  if (!trie) {
    return -1;
  }

  if (!max_num_scan_hits || !str_size) {
    if (str_rest) {
      *str_rest = str;
    }
    return 0;
  }

  int num_scan_hits = 0;
  try {
    if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
      grn_str * const normalized_str = grn_str_open(
          ctx, str, str_size, GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS);
      if (!normalized_str) {
        fprintf(stderr, "error: grn_str_open() failed!\n");
        return -1;
      }
      str = normalized_str->norm;
      str_size = normalized_str->norm_blen;
      const short *checks = normalized_str->checks;
      unsigned int offset = 0;
      while (str_size) {
        if (*checks) {
          grn::dat::UInt32 key_pos;
          if (trie->lcp_search(str, str_size, &key_pos)) {
            const grn::dat::Key &key = trie->get_key(key_pos);
            const grn::dat::UInt32 key_length = key.length();
            if ((key_length == str_size) || (checks[key_length])) {
              unsigned int length = 0;
              for (grn::dat::UInt32 i = 0; i < key_length; ++i) {
                if (checks[i] > 0) {
                  length += checks[i];
                }
              }
              scan_hits[num_scan_hits].id = key.id();
              scan_hits[num_scan_hits].offset = offset;
              scan_hits[num_scan_hits].length = length;
              offset += length;
              str += key_length;
              str_size -= key_length;
              checks += key_length;
              if (++num_scan_hits >= max_num_scan_hits) {
                break;
              }
              continue;
            }
          }
          if (*checks > 0) {
            offset += *checks;
          }
        }
        ++str;
        --str_size;
        ++checks;
      }
      if (str_rest) {
        *str_rest = normalized_str->orig + offset;
      }
      grn_str_close(ctx, normalized_str);
    } else {
      const char * const begin = str;
      while (str_size) {
        grn::dat::UInt32 key_pos;
        if (trie->lcp_search(str, str_size, &key_pos)) {
          const grn::dat::Key &key = trie->get_key(key_pos);
          scan_hits[num_scan_hits].id = key.id();
          scan_hits[num_scan_hits].offset = str - begin;
          scan_hits[num_scan_hits].length = key.length();
          str += key.length();
          str_size -= key.length();
          if (++num_scan_hits >= max_num_scan_hits) {
            break;
          }
        } else {
          const int char_length = grn_charlen(ctx, str, str + str_size);
          if (char_length) {
            str += char_length;
            str_size -= char_length;
          } else {
            ++str;
            --str_size;
          }
        }
      }
      if (str_rest) {
        *str_rest = str;
      }
    }
  } catch (const grn::dat::Exception &ex) {
    ERR(grn_dat_translate_error_code(ex.code()),
        "grn::dat::lcp_search failed");
    return -1;
  }
  return num_scan_hits;
}