Example #1
static grn_obj *
sample_next(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
            grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;
  grn_tokenizer_status status;
  const char *token;
  int token_length;

  /* Emit exactly one character per call. */
  token = tokenizer->next;
  token_length = grn_plugin_charlen(ctx, token, tokenizer->rest, encoding);
  if (token_length == 0 || tokenizer->rest - token_length == 0) {
    /* invalid byte sequence, or nothing left after this character */
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           token, token_length, status);

  tokenizer->next += token_length;
  tokenizer->rest -= token_length;

  return NULL;
}
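
All of the next callbacks on this page read their state from a struct that the matching _init callback stored in user_data->ptr. The struct below is a minimal sketch implied by sample_next() above: the field names match the accesses in the example, but the layout itself is an assumption, not code from the page.

/* Hypothetical state struct implied by sample_next(). */
typedef struct {
  grn_tokenizer_query *query; /* opened in _init; carries the encoding  */
  grn_tokenizer_token token;  /* reusable buffer for _token_push()      */
  const char *next;           /* start of the not-yet-emitted input     */
  unsigned int rest;          /* bytes remaining from next to the end   */
} sample_tokenizer;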
Example #2
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
        ctx,
        &(tokenizer->token),
        (const char *)tokenizer->next,
        rest_length,
        tokenizer->query->encoding);
  } else {
    size_t cl;
    const unsigned char *p = tokenizer->next, *r;
    const unsigned char *e = tokenizer->end;
    grn_token_status status;
    for (r = p; r < e; r += cl) {
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        tokenizer->next = (unsigned char *)e;
        break;
      }
      {
        grn_bool found_delimiter = GRN_FALSE;
        const unsigned char *current_end = r;
        while (current_end + tokenizer->delimiter_len <= e &&
               !memcmp(current_end,
                       tokenizer->delimiter, tokenizer->delimiter_len)) {
          current_end += tokenizer->delimiter_len;
          tokenizer->next = current_end;
          found_delimiter = GRN_TRUE;
        }
        if (found_delimiter) {
          break;
        }
      }
    }
    if (r == e) {
      status = GRN_TOKEN_LAST;
    } else {
      status = GRN_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)p,
                             r - p,
                             status);
  }

  return NULL;
}
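
A next callback such as delimited_next() is only one third of a tokenizer: an init callback opens the query and stashes the state, a fin callback releases it, and the plugin registers the trio. The following is a minimal wiring sketch using the public plugin API (grn_tokenizer_register() and friends); the my_* names and the trimmed init/fin bodies are placeholders, not code from this page.

#include <groonga/tokenizer.h>
#include <string.h>

static grn_obj *
my_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* would call grn_tokenizer_query_open(ctx, nargs, args, 0), allocate
     the tokenizer struct, grn_tokenizer_token_init() its token and
     store everything in user_data->ptr */
  return NULL;
}

static grn_obj *
my_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* would grn_tokenizer_token_fin(), grn_tokenizer_query_close() and
     free the struct */
  return NULL;
}

grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  return GRN_SUCCESS;
}

grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  /* init/next/fin triple; delimited_next() above plays the "next" role */
  return grn_tokenizer_register(ctx, "my_delimit", strlen("my_delimit"),
                                my_init, delimited_next, my_fin);
}

grn_rc
GRN_PLUGIN_FIN(grn_ctx *ctx)
{
  return GRN_SUCCESS;
}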
Example #3
/*
  This function returns tokens one by one.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;

  if (tokenizer->query->have_tokenized_delimiter) {
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
  } else {
    size_t cl;
    const char *p = tokenizer->next, *r;
    const char *e = tokenizer->end;
    grn_tokenizer_status status;

    for (r = p; r < e; r += cl) {
      int space_len;

      space_len = grn_isspace(r, encoding);
      if (space_len > 0 && r == p) {
        cl = space_len;
        p = r + cl;
        continue;
      }

      if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
        tokenizer->next = e;
        break;
      }

      if (space_len > 0) {
        const char *q = r + space_len;
        while (q < e && (space_len = grn_isspace(q, encoding))) {
          q += space_len;
        }
        tokenizer->next = q;
        break;
      }
    }

    if (r == e || tokenizer->next == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }

  return NULL;
}
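
As a reading aid: the else branch scans MeCab's space-separated parse and emits one surface form per call, consuming the separating whitespace so the following call starts on the next form. An illustrative trace (the parse is an assumption, not output from this code):

/* Assuming MeCab produced "東京 都 に 住む":
 *   call 1: pushes "東京" with GRN_TOKENIZER_CONTINUE
 *   call 2: pushes "都"   with GRN_TOKENIZER_CONTINUE
 *   call 3: pushes "に"   with GRN_TOKENIZER_CONTINUE
 *   call 4: pushes "住む" with GRN_TOKENIZER_LAST
 */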
Example #4
static grn_obj *
uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_uvector_tokenizer *tokenizer = user_data->ptr;
  byte *p = tokenizer->curr + tokenizer->unit;
  if (tokenizer->tail < p) {
    grn_tokenizer_token_push(ctx, &(tokenizer->token),
                             (const char *)tokenizer->curr, 0,
                             GRN_TOKEN_LAST);
  } else {
    grn_token_status status;
    if (tokenizer->tail == p) {
      status = GRN_TOKEN_LAST;
    } else {
      status = GRN_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token),
                             (const char *)tokenizer->curr, tokenizer->unit,
                             status);
    tokenizer->curr = p;
  }
  return NULL;
}
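
uvector_next() slices an unsigned-integer vector into fixed-width byte chunks instead of scanning characters. A hedged illustration of the control flow:

/* Illustration (assumption: a vector of 4-byte grn_ids, so unit == 4):
 * for a 12-byte uvector, successive calls push bytes [0,4) and [4,8)
 * with GRN_TOKEN_CONTINUE and [8,12) with GRN_TOKEN_LAST; an empty
 * vector immediately pushes a zero-length GRN_TOKEN_LAST token. */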
Example #5
File: token.c Project: henbow/groonga
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             rest_length,
                                             tokenizer->query->encoding);
  } else {
    size_t cl;
    const unsigned char *p = tokenizer->next, *r;
    const unsigned char *e = tokenizer->end;
    grn_tokenizer_status status;
    for (r = p; r < e; r += cl) {
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        tokenizer->next = (unsigned char *)e;
        break;
      }
      if (r + tokenizer->delimiter_len <= e &&
          !memcmp(r, tokenizer->delimiter, tokenizer->delimiter_len)) {
        tokenizer->next = r + tokenizer->delimiter_len;
        break;
      }
    }
    if (r == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }

  return NULL;
}
Example #6
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  size_t char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      next_start = str_ptr + (current - start + char_length);
      break;
    }
  }

  token_length = current - start;
  if (current == end) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, token, start, token_length, status);

  return next_start;
}
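
The delimiter this helper searches for is Groonga's "tokenized delimiter", U+FFFE (0xEF 0xBF 0xBE in UTF-8), which marks explicit token boundaries in pre-tokenized input. A usage sketch (an illustration, not code from the page):

/* Splitting input that already contains U+FFFE boundaries. */
const char *input = "full\xef\xbf\xbetext\xef\xbf\xbesearch";
/* 1st call: pushes "full"   (GRN_TOKENIZER_CONTINUE), returns "text..."
 * 2nd call: pushes "text"   (GRN_TOKENIZER_CONTINUE), returns "search"
 * 3rd call: pushes "search" (GRN_TOKENIZER_LAST),     returns NULL     */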
Example #7
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool escaping = GRN_FALSE;

  GRN_BULK_REWIND(buffer);

  if (mode == GRN_TOKEN_GET) {
    if (tokenizer->get.have_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      tokenizer->get.have_begin = GRN_FALSE;
      return NULL;
    }

    if (tokenizer->is_end && tokenizer->get.have_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  } else {
    if (tokenizer->is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      tokenizer->is_begin = GRN_FALSE;
      return NULL;
    }

    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }

  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }

  while (GRN_TRUE) {
    if (!escaping && mode == GRN_TOKEN_GET &&
        char_len == 1 && current[0] == '\\') {
      current += char_len;
      escaping = GRN_TRUE;
    } else {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      escaping = GRN_FALSE;
      if (n_characters == 1) {
        tokenizer->next = current;
      }
      if (n_characters == ngram_unit) {
        break;
      }
    }

    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
                            tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }
  }

  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);

  if (mode == GRN_TOKEN_GET) {
    if ((end - tokenizer->next) < ngram_unit) {
      if (tokenizer->get.have_end) {
        if (tokenizer->next == end) {
          tokenizer->is_end = GRN_TRUE;
        }
        if (status & GRN_TOKEN_UNMATURED) {
          if (tokenizer->is_first_token) {
            status |= GRN_TOKEN_FORCE_PREFIX;
          } else {
            status |= GRN_TOKEN_SKIP;
          }
        }
      } else {
        tokenizer->is_end = GRN_TRUE;
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
        if (status & GRN_TOKEN_UNMATURED) {
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    } else {
      if (tokenizer->get.n_skip_tokens > 0) {
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
  }

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);
  tokenizer->is_first_token = GRN_FALSE;

  return NULL;
}
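
A brief key to the status bits combined above, paraphrasing the public grn_token_status flags:

/* GRN_TOKEN_LAST         : no more tokens will follow
 * GRN_TOKEN_REACH_END    : the scan has consumed the end of the input
 * GRN_TOKEN_OVERLAP      : token overlaps the previous one (n-gram shift)
 * GRN_TOKEN_UNMATURED    : token is shorter than the n-gram unit
 * GRN_TOKEN_SKIP         : the searcher should skip this token
 * GRN_TOKEN_FORCE_PREFIX : force prefix match for this token           */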
Example #8
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    /* Guarded-out legacy path: tid and sym are not declared in this
       function, so this block does not compile if the guard is enabled. */
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
      p += strlen(key);
      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                           tokenizer->query->encoding))) {
      len++;
      r += cl;
      tokenizer->next = r;
      while (len < tokenizer->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                tokenizer->query->encoding))) {
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          if ((tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }
  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;
  if (p == r || tokenizer->next == e) {
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (r == e) { status |= GRN_TOKEN_REACH_END; }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)p,
                           r - p,
                           status);
  return NULL;
}
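
To make the overlap/skip bookkeeping concrete, here is a rough trace under plain bigram settings (ngram_unit == 2, no uni_* grouping); an illustration, not output from the page:

/* ngram_next() over "abcd" in ADD mode:
 *   push "ab"  -- no flags; overlap turns on, skip becomes 1
 *   push "bc"  -- GRN_TOKEN_OVERLAP
 *   push "cd"  -- GRN_TOKEN_OVERLAP | GRN_TOKEN_REACH_END (r hit end)
 *   push "d"   -- OVERLAP | UNMATURED | REACH_END | LAST (next hit end)
 */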
Example #9
static grn_obj *
yangram_next(grn_ctx *ctx, GNUC_UNUSED int nargs, grn_obj **args,
             grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *string_end = tokenizer->end;
  const unsigned char *token_top = tokenizer->next;
  const unsigned char *token_next = token_top;
  const unsigned char *token_tail = token_top;
  int token_size = 0;
  grn_bool is_token_grouped = GRN_FALSE;
  const unsigned char *token_ctypes = NULL;
  unsigned int ctypes_skip_size;
  int char_length = 0;
  grn_tokenizer_status status = 0;
  grn_bool is_token_hit = GRN_FALSE;
  grn_obj *lexicon = args[0];

  if (tokenizer->phrase_table) {
    if (tokenizer->nhits > 0 &&
        token_top - (const unsigned char *)tokenizer->scan_start >
        tokenizer->hits[tokenizer->current_hit].offset) {
      tokenizer->current_hit++;
    }
    if (tokenizer->current_hit >= tokenizer->nhits) {
      tokenizer->scan_start = tokenizer->scan_rest;
      unsigned int scan_rest_length = tokenizer->end - (const unsigned char *)tokenizer->scan_rest;
      if (scan_rest_length > 0) {
        tokenizer->nhits = grn_pat_scan(ctx, (grn_pat *)tokenizer->phrase_table,
                                        tokenizer->scan_rest,
                                        scan_rest_length,
                                        tokenizer->hits, MAX_N_HITS, &(tokenizer->scan_rest));
        tokenizer->current_hit = 0;
      }
    }
    if (tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        token_top - (const unsigned char *)tokenizer->scan_start ==
        tokenizer->hits[tokenizer->current_hit].offset) {
      is_token_hit = GRN_TRUE;
    }
  }

  if (tokenizer->ctypes) {
    token_ctypes = tokenizer->ctypes + tokenizer->ctypes_next;
  } else {
    token_ctypes = NULL;
  }

  if (is_token_hit) {
    token_size = forward_scan_hit_token_tail(ctx, tokenizer, &token_tail,
                                             tokenizer->hits[tokenizer->current_hit].length);
    token_next = token_tail;
    tokenizer->current_hit++;
  } else {
    is_token_grouped = is_token_group(tokenizer, token_ctypes);
    if (is_token_grouped) {
      token_size = forward_grouped_token_tail(ctx, tokenizer, token_ctypes, &token_tail);
      token_next = token_tail;
    } else {
      token_size = forward_ngram_token_tail(ctx, tokenizer, token_ctypes, &token_tail);
      char_length = grn_plugin_charlen(ctx, (char *)token_next,
                                       tokenizer->rest_length,
                                       tokenizer->query->encoding);
      token_next += char_length;
    }
  }

  if (token_top == token_tail || token_next == string_end) {
    ctypes_skip_size = 0;
  } else {
    if (is_token_grouped || is_token_hit) {
      ctypes_skip_size = token_size;
    } else {
      ctypes_skip_size = 1;
    }
  }

  if (tokenizer->use_vgram > 0 && !is_token_grouped) {
    grn_bool maybe_vgram = GRN_FALSE;

    grn_id id;
    id = grn_table_get(ctx, tokenizer->vgram_table,
                       (const char *)token_top, token_tail - token_top);
    if (id) {
      maybe_vgram = GRN_TRUE;
    }

    if (tokenizer->use_vgram >= VGRAM_BOTH && !maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
        grn_id id;
        const unsigned char *token_next_tail;
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_next_tail = token_tail + char_length;
        id = grn_table_get(ctx, tokenizer->vgram_table,
                           (const char *)token_next, token_next_tail - token_next);
        if (id) {
          maybe_vgram = GRN_TRUE;
        }
      } else if (token_tail == string_end &&
                 tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
        maybe_vgram = GRN_TRUE;
      }
    }

    if (maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_size++;
        token_tail += char_length;

        if (tokenizer->use_vgram == VGRAM_QUAD) {
          if (token_tail < string_end &&
              !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
            id = grn_table_get(ctx, tokenizer->vgram_table,
                               (const char *)token_top, token_tail - token_top);
            if (id) {
              char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                               tokenizer->rest_length,
                                               tokenizer->query->encoding);
              token_size++;
              token_tail += char_length;
            }
          } else {
            if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
              grn_id tid;
              tid = grn_table_get(ctx, lexicon,
                                  (const char *)token_top, token_tail - token_top);
              if (tid == GRN_ID_NIL) {
                int added;
                grn_table_add(ctx, lexicon,
                              (const char *)token_top, token_tail - token_top, &added);
              }
              status |= GRN_TOKEN_FORCE_PREFIX;
            }
          }
        }
      } else {
        if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
          grn_id tid;
          tid = grn_table_get(ctx, lexicon,
                              (const char *)token_top, token_tail - token_top);
          if (tid == GRN_ID_NIL) {
            int added;
            grn_table_add(ctx, lexicon,
                          (const char *)token_top, token_tail - token_top, &added);
          }
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    }
  }

  if (token_top == token_tail || token_next == string_end) {
    status |= GRN_TOKEN_LAST;
  }

  if (token_tail == string_end) {
    status |= GRN_TOKEN_REACH_END;
  }

  if (!is_token_grouped && !is_token_hit && token_size < tokenizer->ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }

  if (tokenizer->pushed_token_tail &&
      token_top < tokenizer->pushed_token_tail) {
    status |= GRN_TOKEN_OVERLAP;
    if (tokenizer->skip_overlap &&
        !grn_ii_overlap_token_skip_enable &&
        !(status & GRN_TOKEN_REACH_END) &&
        !(status & GRN_TOKEN_SKIP_WITH_POSITION) &&
        tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
      if (token_tail <= tokenizer->pushed_token_tail) {
        status |= GRN_TOKEN_SKIP;
      } else {
        if (!is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
          status |= GRN_TOKEN_SKIP;
        }
      }
    }
  }

  if (!(status & GRN_TOKEN_SKIP) &&
      !(status & GRN_TOKEN_SKIP_WITH_POSITION)) {
    tokenizer->pushed_token_tail = token_tail;
  }

  tokenizer->next = token_next;
  tokenizer->rest_length = string_end - token_next;
  tokenizer->ctypes_next = tokenizer->ctypes_next + ctypes_skip_size;

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)token_top,
                           token_tail - token_top,
                           status);

  return NULL;
}
Example #10
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool is_begin = tokenizer->is_begin;
  grn_bool is_start_token = tokenizer->is_start_token;
  grn_bool break_by_blank = GRN_FALSE;
  grn_bool break_by_end_mark = GRN_FALSE;

  GRN_BULK_REWIND(buffer);
  tokenizer->is_begin = GRN_FALSE;
  tokenizer->is_start_token = GRN_FALSE;

  if (char_types) {
    char_types += tokenizer->nth_char;
  }

  if (mode != GRN_TOKEN_GET) {
    if (is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (is_start_token) {
      if (char_types && GRN_STR_ISBLANK(char_types[-1])) {
        status |= GRN_TOKEN_SKIP;
        grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
        return NULL;
      }
    }
  }

  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }

  if (mode == GRN_TOKEN_GET) {
    if (is_begin &&
        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      tokenizer->next = current;
      tokenizer->nth_char++;
      if (current == end) {
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      }
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    if (current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }

  while (GRN_TRUE) {
    n_characters++;
    GRN_TEXT_PUT(ctx, buffer, current, char_len);
    current += char_len;
    if (n_characters == 1) {
      tokenizer->next = current;
      tokenizer->nth_char++;
    }

    if (char_types) {
      uint_least8_t char_type;
      char_type = char_types[0];
      char_types++;
      if (GRN_STR_ISBLANK(char_type)) {
        break_by_blank = GRN_TRUE;
      }
    }

    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
                            tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }

    if (mode == GRN_TOKEN_GET &&
        current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      break_by_end_mark = GRN_TRUE;
    }

    if (break_by_blank || break_by_end_mark) {
      break;
    }

    if (n_characters == ngram_unit) {
      break;
    }
  }

  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);

  if (mode == GRN_TOKEN_GET) {
    if (current == end) {
      tokenizer->is_end = GRN_TRUE;
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      if (status & GRN_TOKEN_UNMATURED) {
        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
      } else if (break_by_end_mark) {
        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
          status |= GRN_TOKEN_SKIP;
        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
    if (break_by_blank) {
      tokenizer->is_start_token = GRN_TRUE;
    }
  }

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);

  return NULL;
}
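
For orientation: this variant brackets the indexed text with the begin/end sentinel marks so that anchored patterns can be answered from the index. A rough sketch of the emitted stream (an illustration consistent with the branches above, not captured output):

/* ADD mode : BEGIN_MARK, "ab", "bc"(OVERLAP), ..., END_MARK(LAST)
 * GET mode : the begin/end marks are consumed only when the query was
 *            built with them (the memcmp branches above), so unanchored
 *            patterns never match the sentinels. */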