Example #1
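forward_grouped_token_tail() advances *token_tail across a run of same-class characters when the matching split_* option is disabled, stepping one character at a time with grn_plugin_charlen(). The three branches differ only in the character class (alpha, digit, symbol) they keep grouped; the return value is the number of characters consumed.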
static int
forward_grouped_token_tail(grn_ctx *ctx, grn_yangram_tokenizer *tokenizer,
                           const unsigned char *ctypes,
                           const unsigned char **token_tail)
{
  int token_size = 0;
  unsigned int char_length;
  unsigned int rest_length = tokenizer->rest_length;
  if (ctypes &&
      tokenizer->split_alpha == GRN_FALSE &&
      GRN_STR_CTYPE(*ctypes) == GRN_CHAR_ALPHA) {
    while ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail, rest_length,
                                             tokenizer->query->encoding))) {
      token_size++;
      *token_tail += char_length;
      rest_length -= char_length;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
        break;
      }
      if (GRN_STR_CTYPE(*++ctypes) != GRN_CHAR_ALPHA) {
        break;
      }
    }
  } else if (ctypes &&
             tokenizer->split_digit == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_DIGIT) {
    while ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail, rest_length,
                                             tokenizer->query->encoding))) {
      token_size++;
      *token_tail += char_length;
      rest_length -= char_length;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
        break;
      }
      if (GRN_STR_CTYPE(*++ctypes) != GRN_CHAR_DIGIT) {
        break;
      }
    }
  } else if (ctypes &&
             tokenizer->split_symbol == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_SYMBOL) {
    while ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail, rest_length,
                                             tokenizer->query->encoding))) {
      token_size++;
      *token_tail += char_length;
      rest_length -= char_length;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
        break;
      }
      if (GRN_STR_CTYPE(*++ctypes) != GRN_CHAR_SYMBOL) {
        break;
      }
    }
  }
  return token_size;
}
Example #2
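forward_ngram_token_tail() extends an n-gram token up to ngram_unit characters. Each step is measured with grn_plugin_charlen(), and the loop stops early at a phrase-table hit offset, at a blank (unless ignore_blank is set), or when the next character belongs to a class that is grouped rather than split.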
static int
forward_ngram_token_tail(grn_ctx *ctx, grn_yangram_tokenizer *tokenizer,
                         const unsigned char *ctypes,
                         const unsigned char **token_tail)
{
  int token_size = 0;
  unsigned int char_length;
  unsigned int rest_length = tokenizer->rest_length;

  if ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail, rest_length,
                                        tokenizer->query->encoding))) {
    token_size++;
    *token_tail += char_length;
    rest_length -= char_length;
    while (token_size < tokenizer->ngram_unit &&
           (char_length = grn_plugin_charlen(ctx, (char *)*token_tail, rest_length,
                                             tokenizer->query->encoding))) {
      if (tokenizer->phrase_table) {
        if (tokenizer->nhits > 0 &&
            tokenizer->current_hit < tokenizer->nhits &&
            *token_tail - (const unsigned char *)tokenizer->scan_start ==
            tokenizer->hits[tokenizer->current_hit].offset) {
          break;
        }
      }

      if (ctypes) {
        if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*ctypes)) {
          break;
        }
       
        ctypes++;
        if ((tokenizer->split_alpha == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_ALPHA) ||
            (tokenizer->split_digit == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_DIGIT) ||
            (tokenizer->split_symbol == GRN_FALSE &&
             GRN_STR_CTYPE(*ctypes) == GRN_CHAR_SYMBOL)) {
          break;
        }
      }
      token_size++;
      *token_tail += char_length;
      rest_length -= char_length;
    }
  }
  return token_size;
}
Example #3
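sample_next() is a minimal next callback that emits one character per call: grn_plugin_charlen() yields the byte length of the leading character, and GRN_TOKENIZER_LAST is set when the character is empty or invalid, or when nothing remains after it.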
static grn_obj *
sample_next(grn_ctx *ctx, GNUC_UNUSED int nargs, GNUC_UNUSED grn_obj **args,
            grn_user_data *user_data)
{
  sample_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;
  grn_tokenizer_status status;
  const char *token;
  int token_length;

  token = tokenizer->next;
  token_length = grn_plugin_charlen(ctx, token, tokenizer->rest, encoding);
  if (token_length == 0 || tokenizer->rest - token_length == 0) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, &(tokenizer->token),
                           token, token_length, status);

  tokenizer->next += token_length;
  tokenizer->rest -= token_length;

  return NULL;
}
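For context, a next callback like this is typically wired up from the plugin's GRN_PLUGIN_REGISTER hook. The sketch below is not part of the original example and assumes matching sample_init() and sample_fin() callbacks are defined elsewhere in the same plugin:

#include <groonga/tokenizer.h>

/* Hypothetical registration sketch: sample_init() and sample_fin() are
   assumed to exist elsewhere; only sample_next() is shown above. */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_tokenizer_register(ctx, "sample", 6,
                                sample_init, sample_next, sample_fin);
}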
Example #4
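forward_scan_hit_token_tail() consumes a phrase-table hit: it advances *token_tail character by character, using grn_plugin_charlen() for each step, until at least scan_length bytes have been covered, and returns the character count.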
static int
forward_scan_hit_token_tail(grn_ctx *ctx, grn_yangram_tokenizer *tokenizer,
                            const unsigned char **token_tail,
                            unsigned int scan_length)
{
  int token_size = 0;
  unsigned int char_length;
  unsigned int rest_length = tokenizer->rest_length;
  const unsigned char *token_top = *token_tail;

  while ((char_length = grn_plugin_charlen(ctx, (char *)*token_tail, rest_length,
                                           tokenizer->query->encoding))) {
    token_size++;
    *token_tail += char_length;
    rest_length -= char_length;
    if (*token_tail - token_top >= scan_length) {
      break;
    }
  }
  return token_size;
}
Example #5
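grn_tokenizer_charlen() is a thin wrapper kept only for backward compatibility; as the comment notes, new code should call grn_plugin_charlen() directly.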
/*
  Just for backward compatibility. See grn_plugin_charlen() instead.
 */
int
grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
}
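To make the call pattern explicit, here is a minimal stand-alone sketch (not from the original page) that counts the characters in a string the same way the loops above do, relying on grn_plugin_charlen() returning 0 at the end of input or on an invalid byte sequence:

#include <groonga/plugin.h>

/* Minimal sketch: count characters by stepping through the string with
   grn_plugin_charlen(), which returns the byte length of the leading
   character, or 0 when the input is empty or invalid. */
static int
count_chars(grn_ctx *ctx, const char *str, unsigned int str_length,
            grn_encoding encoding)
{
  int n_chars = 0;
  int char_length;
  while ((char_length = grn_plugin_charlen(ctx, str, str_length, encoding))) {
    n_chars++;
    str += char_length;
    str_length -= char_length;
  }
  return n_chars;
}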
Example #6
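yangram_next() is the main next callback of the yangram tokenizer and ties the helpers above together: it tracks phrase-table hits via grn_pat_scan(), picks hit, grouped, or plain n-gram forwarding, optionally stretches the token using the vgram table, and finally sets status flags such as GRN_TOKEN_LAST, GRN_TOKEN_OVERLAP, and GRN_TOKEN_SKIP before pushing the token.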
static grn_obj *
yangram_next(grn_ctx *ctx, GNUC_UNUSED int nargs, grn_obj **args,
             grn_user_data *user_data)
{
  grn_yangram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *string_end = tokenizer->end;
  const unsigned char *token_top = tokenizer->next;
  const unsigned char *token_next = token_top;
  const unsigned char *token_tail = token_top;
  int token_size = 0;
  grn_bool is_token_grouped = GRN_FALSE;
  const unsigned char *token_ctypes = NULL;
  unsigned int ctypes_skip_size;
  int char_length = 0;
  grn_tokenizer_status status = 0;
  grn_bool is_token_hit = GRN_FALSE;
  grn_obj *lexicon = args[0];

  if (tokenizer->phrase_table) {
    if (tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        token_top - (const unsigned char *)tokenizer->scan_start >
        tokenizer->hits[tokenizer->current_hit].offset) {
      tokenizer->current_hit++;
    }
    if (tokenizer->current_hit >= tokenizer->nhits) {
      tokenizer->scan_start = tokenizer->scan_rest;
      unsigned int scan_rest_length = tokenizer->end - (const unsigned char *)tokenizer->scan_rest;
      if (scan_rest_length > 0) {
        tokenizer->nhits = grn_pat_scan(ctx, (grn_pat *)tokenizer->phrase_table,
                                        tokenizer->scan_rest,
                                        scan_rest_length,
                                        tokenizer->hits, MAX_N_HITS, &(tokenizer->scan_rest));
        tokenizer->current_hit = 0;
      }
    }
    if (tokenizer->nhits > 0 &&
        tokenizer->current_hit < tokenizer->nhits &&
        token_top - (const unsigned char *)tokenizer->scan_start ==
        tokenizer->hits[tokenizer->current_hit].offset) {
      is_token_hit = GRN_TRUE;
    }
  }

  if (tokenizer->ctypes) {
    token_ctypes = tokenizer->ctypes + tokenizer->ctypes_next;
  } else {
    token_ctypes = NULL;
  }

  if (is_token_hit) {
    token_size = forward_scan_hit_token_tail(ctx, tokenizer, &token_tail,
                                             tokenizer->hits[tokenizer->current_hit].length);
    token_next = token_tail;
    tokenizer->current_hit++;
  } else {
    is_token_grouped = is_token_group(tokenizer, token_ctypes);
    if (is_token_grouped) {
      token_size = forward_grouped_token_tail(ctx, tokenizer, token_ctypes, &token_tail);
      token_next = token_tail;
    } else {
      token_size = forward_ngram_token_tail(ctx, tokenizer, token_ctypes, &token_tail);
      char_length = grn_plugin_charlen(ctx, (char *)token_next,
                                       tokenizer->rest_length,
                                       tokenizer->query->encoding);
      token_next += char_length;
    }
  }

  if (token_top == token_tail || token_next == string_end) {
    ctypes_skip_size = 0;
  } else {
    if (is_token_grouped || is_token_hit) {
      ctypes_skip_size = token_size;
    } else {
      ctypes_skip_size = 1;
    }
  }

  if (tokenizer->use_vgram > 0 && !is_token_grouped) {
    grn_bool maybe_vgram = GRN_FALSE;

    grn_id id;
    id = grn_table_get(ctx, tokenizer->vgram_table,
                       (const char *)token_top, token_tail - token_top);
    if (id) {
      maybe_vgram = GRN_TRUE;
    }

    if (tokenizer->use_vgram >= VGRAM_BOTH && !maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
        grn_id id;
        const unsigned char *token_next_tail;
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_next_tail = token_tail + char_length;
        id = grn_table_get(ctx, tokenizer->vgram_table,
                           (const char *)token_next, token_next_tail - token_next);
        if (id) {
          maybe_vgram = GRN_TRUE;
        }
      } else if (token_tail == string_end &&
                 tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
        maybe_vgram = GRN_TRUE;
      }
    }

    if (maybe_vgram) {
      if (token_tail < string_end &&
          !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
        char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                         tokenizer->rest_length,
                                         tokenizer->query->encoding);
        token_size++;
        token_tail += char_length;

        if (tokenizer->use_vgram == VGRAM_QUAD) {
          if (token_tail < string_end &&
              !is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
            id = grn_table_get(ctx, tokenizer->vgram_table,
                               (const char *)token_top, token_tail - token_top);
            if (id) {
              char_length = grn_plugin_charlen(ctx, (char *)token_tail,
                                               tokenizer->rest_length,
                                               tokenizer->query->encoding);
              token_size++;
              token_tail += char_length;
            }
          } else {
            if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
              grn_id tid;
              tid = grn_table_get(ctx, lexicon,
                                  (const char *)token_top, token_tail - token_top);
              if (tid == GRN_ID_NIL) {
                int added;
                grn_table_add(ctx, lexicon,
                              (const char *)token_top, token_tail - token_top, &added);
              }
              status |= GRN_TOKEN_FORCE_PREFIX;
            }
          }
        }
      } else {
        if (tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
          grn_id tid;
          tid = grn_table_get(ctx, lexicon,
                              (const char *)token_top, token_tail - token_top);
          if (tid == GRN_ID_NIL) {
            int added;
            grn_table_add(ctx, lexicon,
                          (const char *)token_top, token_tail - token_top, &added);
          }
          status |= GRN_TOKEN_FORCE_PREFIX;
        }
      }
    }
  }

  if (token_top == token_tail || token_next == string_end) {
    status |= GRN_TOKEN_LAST;
  }

  if (token_tail == string_end) {
    status |= GRN_TOKEN_REACH_END;
  }

  if (!is_token_grouped && !is_token_hit && token_size < tokenizer->ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }

  if (tokenizer->pushed_token_tail &&
      token_top < tokenizer->pushed_token_tail) {
    status |= GRN_TOKEN_OVERLAP;
    if (tokenizer->skip_overlap &&
        !grn_ii_overlap_token_skip_enable &&
        !(status & GRN_TOKEN_REACH_END) &&
        !(status & GRN_TOKEN_SKIP_WITH_POSITION) &&
        tokenizer->query->tokenize_mode == GRN_TOKENIZE_GET) {
      if (token_tail <= tokenizer->pushed_token_tail) {
        status |= GRN_TOKEN_SKIP;
      } else {
        if (!is_group_border(ctx, tokenizer, token_tail, token_ctypes, token_size)) {
          status |= GRN_TOKEN_SKIP;
        }
      }
    }
  }

  if (!(status & GRN_TOKEN_SKIP) &&
      !(status & GRN_TOKEN_SKIP_WITH_POSITION)) {
    tokenizer->pushed_token_tail = token_tail;
  }

  tokenizer->next = token_next;
  tokenizer->rest_length = string_end - token_next;
  tokenizer->ctypes_next = tokenizer->ctypes_next + ctypes_skip_size;

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)token_top,
                           token_tail - token_top,
                           status);

  return NULL;
}