Esempio n. 1
0
static grn_rc
mecab_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data)
{
  size_t cl;
  grn_mecab_tokenizer *token = user_data->ptr;
  const unsigned char *p = token->next, *r;
  const unsigned char *e = token->end;
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
      token->next = (unsigned char *)e;
      break;
    }
    if (grn_isspace((const char *)r, token->encoding)) {
      const unsigned char *q = r;
      while ((cl = grn_isspace((const char *)q, token->encoding))) { q += cl; }
      token->next = (unsigned char *)q;
      break;
    }
  }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return GRN_SUCCESS;
}
Esempio n. 2
0
/*
  This function returns tokens one by one.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *token = user_data->ptr;
  char *p = token->next, *r;
  char *e = token->end;
  for (r = p; r < e; r += cl) {
    if (!(cl = grn_charlen_(ctx, r, e, token->encoding))) {
      token->next = e;
      break;
    }
    if (grn_isspace(r, token->encoding)) {
      char *q = r;
      while ((cl = grn_isspace(q, token->encoding))) { q += cl; }
      token->next = q;
      break;
    }
  }
  GRN_TEXT_SET_REF(&token->curr_, p, r - p);
  GRN_UINT32_SET(ctx, &token->stat_, r == e ? GRN_TOKEN_LAST : 0);
  grn_ctx_push(ctx, &token->curr_);
  grn_ctx_push(ctx, &token->stat_);
  return NULL;
}
Esempio n. 3
0
/*
  This function returns tokens one by one.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;

  if (tokenizer->query->have_tokenized_delimiter) {
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
  } else {
    size_t cl;
    const char *p = tokenizer->next, *r;
    const char *e = tokenizer->end;
    grn_tokenizer_status status;

    for (r = p; r < e; r += cl) {
      int space_len;

      space_len = grn_isspace(r, encoding);
      if (space_len > 0 && r == p) {
        cl = space_len;
        p = r + cl;
        continue;
      }

      if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
        tokenizer->next = e;
        break;
      }

      if (space_len > 0) {
        const char *q = r + space_len;
        while (q < e && (space_len = grn_isspace(q, encoding))) {
          q += space_len;
        }
        tokenizer->next = q;
        break;
      }
    }

    if (r == e || tokenizer->next == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }

  return NULL;
}
Esempio n. 4
0
inline static grn_cell *
get_word(grn_ctx *ctx, grn_query *q, int *prefixp)
{
  char *start = q->cur, *end;
  unsigned int len;
  for (end = q->cur;; ) {
    /* null check and length check */
    if (!(len = grn_charlen(ctx, end, q->str_end))) {
      q->cur = q->str_end;
      break;
    }
    if (grn_isspace(end, q->encoding) ||
        *end == GRN_QUERY_PARENR) {
      q->cur = end;
      break;
    }
    if (*end == GRN_QUERY_PREFIX) {
      *prefixp = 1;
      q->cur = end + 1;
      break;
    }
    end += len;
  }
  return token_new(q, start, end);
}
Esempio n. 5
0
inline static void
skip_space(grn_ctx *ctx, grn_query *q)
{
  unsigned int len;
  while (q->cur < q->str_end && grn_isspace(q->cur, q->encoding)) {
    /* null check and length check */
    if (!(len = grn_charlen(ctx, q->cur, q->str_end))) {
      q->cur = q->str_end;
      break;
    }
    q->cur += len;
  }
}
Esempio n. 6
0
inline static void
get_pragma(grn_ctx *ctx, grn_query *q)
{
  char *start, *end = q->cur;
  while (end < q->str_end && *end == GRN_QUERY_PREFIX) {
    if (++end >= q->str_end) { break; }
    switch (*end) {
    case 'E' :
      start = ++end;
      q->escalation_threshold = grn_atoi(start, q->str_end, (const char **)&end);
      while (end < q->str_end && (isdigit(*end) || *end == '-')) { end++; }
      if (*end == ',') {
        start = ++end;
        q->escalation_decaystep = grn_atoi(start, q->str_end, (const char **)&end);
      }
      q->cur = end;
      break;
    case 'D' :
      start = ++end;
      while (end < q->str_end && *end != GRN_QUERY_PREFIX && !grn_isspace(end, q->encoding)) {
        end++;
      }
      if (end > start) {
        switch (*start) {
        case 'O' :
          q->default_op = GRN_OP_OR;
          break;
        case GRN_QUERY_AND :
          q->default_op = GRN_OP_AND;
          break;
        case GRN_QUERY_BUT :
          q->default_op = GRN_OP_BUT;
          break;
        case GRN_QUERY_ADJ_INC :
          q->default_op = GRN_OP_ADJUST;
          break;
        }
      }
      q->cur = end;
      break;
    case 'W' :
      start = ++end;
      end = (char *)get_weight_vector(ctx, q, start);
      q->cur = end;
      break;
    }
  }
}
Esempio n. 7
0
static void
json_read(grn_ctx *ctx, grn_loader *loader, const char *str, unsigned int str_len)
{
  const char *const beg = str;
  char c;
  int len;
  const char *se = str + str_len;
  while (str < se) {
    c = *str;
    switch (loader->stat) {
    case GRN_LOADER_BEGIN :
      if ((len = grn_isspace(str, ctx->encoding))) {
        str += len;
        continue;
      }
      switch (c) {
      case '[' :
        JSON_READ_OPEN_BRACKET();
        break;
      case '{' :
        JSON_READ_OPEN_BRACE();
        break;
      default :
        ERR(GRN_INVALID_ARGUMENT,
            "JSON must start with '[' or '{': <%.*s>", str_len, beg);
        loader->stat = GRN_LOADER_END;
        break;
      }
      break;
    case GRN_LOADER_TOKEN :
      if ((len = grn_isspace(str, ctx->encoding))) {
        str += len;
        continue;
      }
      switch (c) {
      case '"' :
        loader->stat = GRN_LOADER_STRING;
        values_add(ctx, loader);
        str++;
        break;
      case '[' :
        JSON_READ_OPEN_BRACKET();
        break;
      case '{' :
        JSON_READ_OPEN_BRACE();
        break;
      case ':' :
        str++;
        break;
      case ',' :
        str++;
        break;
      case ']' :
        bracket_close(ctx, loader);
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        if (ctx->rc == GRN_CANCEL) {
          loader->stat = GRN_LOADER_END;
        }
        str++;
        break;
      case '}' :
        brace_close(ctx, loader);
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        if (ctx->rc == GRN_CANCEL) {
          loader->stat = GRN_LOADER_END;
        }
        str++;
        break;
      case '+' : case '-' : case '0' : case '1' : case '2' : case '3' :
      case '4' : case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->stat = GRN_LOADER_NUMBER;
        values_add(ctx, loader);
        break;
      default :
        if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('_' == c)) {
          loader->stat = GRN_LOADER_SYMBOL;
          values_add(ctx, loader);
        } else {
          if ((len = grn_charlen(ctx, str, se))) {
            GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char('%c') at", c);
            GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg) + len, beg);
            GRN_LOG(ctx, GRN_LOG_ERROR, "%*s", (int)(str - beg) + 1, "^");
            str += len;
          } else {
            GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char(\\x%.2x) after", c);
            GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg), beg);
            str = se;
          }
        }
        break;
      }
      break;
    case GRN_LOADER_SYMBOL :
      if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') ||
          ('0' <= c && c <= '9') || ('_' == c)) {
        GRN_TEXT_PUTC(ctx, loader->last, c);
        str++;
      } else {
        char *v = GRN_TEXT_VALUE(loader->last);
        switch (*v) {
        case 'n' :
          if (GRN_TEXT_LEN(loader->last) == 4 && !memcmp(v, "null", 4)) {
            loader->last->header.domain = GRN_DB_VOID;
            GRN_BULK_REWIND(loader->last);
          }
          break;
        case 't' :
          if (GRN_TEXT_LEN(loader->last) == 4 && !memcmp(v, "true", 4)) {
            loader->last->header.domain = GRN_DB_BOOL;
            GRN_BOOL_SET(ctx, loader->last, GRN_TRUE);
          }
          break;
        case 'f' :
          if (GRN_TEXT_LEN(loader->last) == 5 && !memcmp(v, "false", 5)) {
            loader->last->header.domain = GRN_DB_BOOL;
            GRN_BOOL_SET(ctx, loader->last, GRN_FALSE);
          }
          break;
        default :
          break;
        }
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
      }
      break;
    case GRN_LOADER_NUMBER :
      switch (c) {
      case '+' : case '-' : case '.' : case 'e' : case 'E' :
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        GRN_TEXT_PUTC(ctx, loader->last, c);
        str++;
        break;
      default :
        {
          const char *cur, *str = GRN_BULK_HEAD(loader->last);
          const char *str_end = GRN_BULK_CURR(loader->last);
          int64_t i = grn_atoll(str, str_end, &cur);
          if (cur == str_end) {
            loader->last->header.domain = GRN_DB_INT64;
            GRN_INT64_SET(ctx, loader->last, i);
          } else if (cur != str) {
            uint64_t i = grn_atoull(str, str_end, &cur);
            if (cur == str_end) {
              loader->last->header.domain = GRN_DB_UINT64;
              GRN_UINT64_SET(ctx, loader->last, i);
            } else if (cur != str) {
              double d;
              char *end;
              grn_obj buf;
              GRN_TEXT_INIT(&buf, 0);
              GRN_TEXT_PUT(ctx, &buf, str, GRN_BULK_VSIZE(loader->last));
              GRN_TEXT_PUTC(ctx, &buf, '\0');
              errno = 0;
              d = strtod(GRN_TEXT_VALUE(&buf), &end);
              if (!errno && end + 1 == GRN_BULK_CURR(&buf)) {
                loader->last->header.domain = GRN_DB_FLOAT;
                GRN_FLOAT_SET(ctx, loader->last, d);
              }
              GRN_OBJ_FIN(ctx, &buf);
            }
          }
        }
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        break;
      }
      break;
    case GRN_LOADER_STRING :
      switch (c) {
      case '\\' :
        loader->stat = GRN_LOADER_STRING_ESC;
        str++;
        break;
      case '"' :
        str++;
        loader->stat = GRN_BULK_VSIZE(&loader->level) ? GRN_LOADER_TOKEN : GRN_LOADER_END;
        /*
        *(GRN_BULK_CURR(loader->last)) = '\0';
        GRN_LOG(ctx, GRN_LOG_ALERT, "read str(%s)", GRN_TEXT_VALUE(loader->last));
        */
        break;
      default :
        if ((len = grn_charlen(ctx, str, se))) {
          GRN_TEXT_PUT(ctx, loader->last, str, len);
          str += len;
        } else {
          GRN_LOG(ctx, GRN_LOG_ERROR, "ignored invalid char(\\x%.2x) after", c);
          GRN_LOG(ctx, GRN_LOG_ERROR, "%.*s", (int)(str - beg), beg);
          str = se;
        }
        break;
      }
      break;
    case GRN_LOADER_STRING_ESC :
      switch (c) {
      case 'b' :
        GRN_TEXT_PUTC(ctx, loader->last, '\b');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'f' :
        GRN_TEXT_PUTC(ctx, loader->last, '\f');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'n' :
        GRN_TEXT_PUTC(ctx, loader->last, '\n');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'r' :
        GRN_TEXT_PUTC(ctx, loader->last, '\r');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 't' :
        GRN_TEXT_PUTC(ctx, loader->last, '\t');
        loader->stat = GRN_LOADER_STRING;
        break;
      case 'u' :
        loader->stat = GRN_LOADER_UNICODE0;
        break;
      default :
        GRN_TEXT_PUTC(ctx, loader->last, c);
        loader->stat = GRN_LOADER_STRING;
        break;
      }
      str++;
      break;
    case GRN_LOADER_UNICODE0 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar = (c - '0') * 0x1000;
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar = (c - 'a' + 10) * 0x1000;
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar = (c - 'A' + 10) * 0x1000;
        break;
      default :
        ;// todo : error
      }
      loader->stat = GRN_LOADER_UNICODE1;
      str++;
      break;
    case GRN_LOADER_UNICODE1 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar += (c - '0') * 0x100;
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar += (c - 'a' + 10) * 0x100;
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar += (c - 'A' + 10) * 0x100;
        break;
      default :
        ;// todo : error
      }
      loader->stat = GRN_LOADER_UNICODE2;
      str++;
      break;
    case GRN_LOADER_UNICODE2 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar += (c - '0') * 0x10;
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar += (c - 'a' + 10) * 0x10;
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar += (c - 'A' + 10) * 0x10;
        break;
      default :
        ;// todo : error
      }
      loader->stat = GRN_LOADER_UNICODE3;
      str++;
      break;
    case GRN_LOADER_UNICODE3 :
      switch (c) {
      case '0' : case '1' : case '2' : case '3' : case '4' :
      case '5' : case '6' : case '7' : case '8' : case '9' :
        loader->unichar += (c - '0');
        break;
      case 'a' : case 'b' : case 'c' : case 'd' : case 'e' : case 'f' :
        loader->unichar += (c - 'a' + 10);
        break;
      case 'A' : case 'B' : case 'C' : case 'D' : case 'E' : case 'F' :
        loader->unichar += (c - 'A' + 10);
        break;
      default :
        ;// todo : error
      }
      {
        uint32_t u = loader->unichar;
        if (u >= 0xd800 && u <= 0xdbff) { /* High-surrogate code points */
          loader->unichar_hi = u;
          loader->stat = GRN_LOADER_STRING;
          str++;
          break;
        }
        if (u >= 0xdc00 && u <= 0xdfff) { /* Low-surrogate code points */
          u = 0x10000 + (loader->unichar_hi - 0xd800) * 0x400 + u - 0xdc00;
        }
        if (u < 0x80) {
          GRN_TEXT_PUTC(ctx, loader->last, u);
        } else {
          if (u < 0x800) {
            GRN_TEXT_PUTC(ctx, loader->last, (u >> 6) | 0xc0);
          } else {
            if (u < 0x10000) {
              GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0);
            } else {
              GRN_TEXT_PUTC(ctx, loader->last, (u >> 18) | 0xf0);
              GRN_TEXT_PUTC(ctx, loader->last, ((u >> 12) & 0x3f) | 0x80);
            }
            GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x3f) | 0x80);
          }
          GRN_TEXT_PUTC(ctx, loader->last, (u & 0x3f) | 0x80);
        }
Esempio n. 8
0
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;

    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    current_character = current;
    current += character_bytes;
    if (is_delimiter_character(ctx, current_character, character_bytes)) {
      last_delimiter = current;
    }

    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}