コード例 #1
0
ファイル: ft_parser.c プロジェクト: Baoxiyi-Github/Mysql
/*
  Find the next word in the buffer [*start, end).

  SYNOPSIS
    cs              character set; cs->cset->ctype() classifies each character
                    and reports how far to advance
    start           in:  where scanning begins
                    out: first byte after the returned word (written only
                         when 1 is returned)
    end             one past the last byte that may be examined
    word            out: pos/len of the candidate word; also overwritten for
                    candidates that are subsequently rejected
    skip_stopwords  FALSE - return every word found;
                    otherwise return only words whose character count lies in
                    [ft_min_word_len, ft_max_word_len) and for which
                    is_stopword() reports false

  RETURN VALUE
    0 - end of buffer reached without an acceptable word
    1 - word found
*/
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint trailing_misc, nchars;
  int char_len;
  int char_type;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Phase 1: step over everything that cannot start a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      char_len= cs->cset->ctype(cs, &char_type, (uchar*)cur, (uchar*)end);
      if (true_word_char(char_type, *cur))
        break;

      /* Advance by |char_len|, or by one byte when ctype() reported 0. */
      if (char_len > 0)
        cur+= char_len;
      else if (char_len < 0)
        cur+= -char_len;
      else
        cur++;
    }

    /*
      Phase 2: collect the word. nchars counts characters (one per
      iteration), trailing_misc counts misc characters seen since the last
      true word character so they can be trimmed from the length.
    */
    trailing_misc= 0;
    nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      char_len= cs->cset->ctype(cs, &char_type, (uchar*)cur, (uchar*)end);
      if (!true_word_char(char_type, *cur))
      {
        if (!misc_word_char(*cur) || trailing_misc)
          break;
        trailing_misc++;
      }
      else
        trailing_misc= 0;

      nchars++;
      if (char_len > 0)
        cur+= char_len;
      else if (char_len < 0)
        cur+= -char_len;
      else
        cur++;
    }

    word->len= (uint)(cur - word->pos) - trailing_misc;

    /* Phase 3: when filtering is on, reject out-of-range words and stopwords. */
    if (skip_stopwords != FALSE)
    {
      if (nchars < ft_min_word_len || nchars >= ft_max_word_len)
        continue;
      if (is_stopword((char*) word->pos, word->len))
        continue;
    }

    *start= cur;
    DBUG_RETURN(1);
  } while (cur < end);
  DBUG_RETURN(0);
}
コード例 #2
0
ファイル: ft_parser.c プロジェクト: isleon/Jaxer
/*
  Find the next word in the buffer [*start, end).

  SYNOPSIS
    cs              character set used by true_word_char() and my_mbcharlen()
    start           in:  where scanning begins
                    out: first byte after the returned word (written only
                         when 1 is returned)
    end             one past the last byte that may be examined
    word            out: pos/len of the candidate word; also overwritten for
                    candidates that are subsequently rejected
    skip_stopwords  FALSE - return every word found;
                    otherwise return only words whose character count lies in
                    [ft_min_word_len, ft_max_word_len) and for which
                    is_stopword() reports false

  RETURN VALUE
    0 - end of buffer reached without an acceptable word
    1 - word found
*/
byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
                        FT_WORD *word, my_bool skip_stopwords)
{
  byte *cur= *start;
  uint trailing_misc, nchars, char_len;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Phase 1: step over everything that cannot start a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      if (true_word_char(cs, *cur))
        break;
      char_len= my_mbcharlen(cs, *(uchar *)cur);
      /* A reported length of 0 still has to move forward by one byte. */
      cur+= (char_len ? char_len : 1);
    }

    /*
      Phase 2: collect the word. nchars counts characters (one per
      iteration), trailing_misc counts misc characters seen since the last
      true word character so they can be trimmed from the length.
    */
    trailing_misc= 0;
    nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      if (!true_word_char(cs, *cur))
      {
        if (!misc_word_char(*cur) || trailing_misc)
          break;
        trailing_misc++;
      }
      else
        trailing_misc= 0;

      nchars++;
      char_len= my_mbcharlen(cs, *(uchar *)cur);
      cur+= (char_len ? char_len : 1);
    }

    word->len= (uint)(cur - word->pos) - trailing_misc;

    /* Phase 3: when filtering is on, reject out-of-range words and stopwords. */
    if (skip_stopwords != FALSE)
    {
      if (nchars < ft_min_word_len || nchars >= ft_max_word_len)
        continue;
      if (is_stopword(word->pos, word->len))
        continue;
    }

    *start= cur;
    DBUG_RETURN(1);
  } while (cur < end);
  DBUG_RETURN(0);
}
コード例 #3
0
/*
  Fetch the next token from a boolean-mode full-text query string.

  SYNOPSIS
    cs     character set; cs->cset->ctype() classifies each character and
           reports how far to advance
    start  in:  where scanning begins
           out: first byte after the token that was returned
    end    one past the last byte that may be examined
    word   out: pos/len of the word (for word and stopword tokens)
    param  in/out parser state:
             quot           non-NULL while inside a quoted phrase
             prev           last non-word byte seen ('A' after a word)
             yesno, weight_adjust, wasign
                            operator modifiers collected in front of a word;
                            reset on entry and whenever a non-operator,
                            non-word byte is passed
             trunc          set when the word is followed by FTB_TRUNC
             type           token type, also the return value

  RETURN VALUE
    param->type, which this function sets to one of
      FT_TOKEN_EOF         (0) - eof
      FT_TOKEN_WORD        (1) - word found
      FT_TOKEN_LEFT_PAREN  (2) - left bracket (or opening quote)
      FT_TOKEN_RIGHT_PAREN (3) - right bracket (or closing quote)
      FT_TOKEN_STOPWORD    (4) - stopword found
*/
uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end,
                  FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
{
  const uchar *doc=*start;
  int ctype;
  uint mwc, length;
  int mbl;

  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
  param->weight_adjust= param->wasign= 0;
  param->type= FT_TOKEN_EOF;

  while (doc<end)
  {
    /*
      Skip non-word characters, handling brackets, quotes and operators.
      Advance by |mbl|, or by one byte when ctype() reported 0.
    */
    for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        break;
      if (*doc == FTB_RQUOT && param->quot)
      {
        /* Closing quote of a phrase is reported like a right bracket. */
        *start=doc+1;
        param->type= FT_TOKEN_RIGHT_PAREN;
        goto ret;
      }
      if (!param->quot)
      {
        if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
        {
          /* param->prev=' '; */
          *start=doc+1;
          if (*doc == FTB_LQUOT)
            param->quot= (char*) 1;     /* any non-NULL value: "inside quote" */
          param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
          goto ret;
        }
        if (param->prev == ' ')
        {
          /*
            Operators are honoured only right after a space. Each 'continue'
            skips the reset below, so consecutive operators accumulate.
          */
          if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
          if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
          if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
          if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
          if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
          if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
        }
      }
      /* Any other non-word byte cancels the modifiers collected so far. */
      param->prev=*doc;
      param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
      param->weight_adjust= param->wasign= 0;
    }

    /*
      Collect the word. length counts characters (one per iteration); mwc
      counts misc characters seen since the last true word character so that
      they can be trimmed from word->len.
    */
    mwc=length=0;
    for (word->pos= doc; doc < end; length++,
         doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;
    }
    param->prev='A'; /* be sure *prev is true_word_char */
    word->len= (uint)(doc-word->pos) - mwc;
    /* A truncation operator right after the word is consumed with it. */
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
      doc++;

    /*
      A word is accepted when it is shorter than ft_max_word_len and either
      carries the truncation operator or is long enough and not a stopword.
    */
    if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
                                                    word->len))
         || param->trunc) && length < ft_max_word_len)
    {
      *start=doc;
      param->type= FT_TOKEN_WORD;
      goto ret;
    }
    else if (length) /* make sure length > 0 (if start contains spaces only) */
    {
      *start= doc;
      param->type= FT_TOKEN_STOPWORD;
      goto ret;
    }
  }
  if (param->quot)
  {
    /* Input ended inside a quoted phrase: report an implicit closing bracket. */
    *start= doc;
    param->type= FT_TOKEN_RIGHT_PAREN;
    goto ret;
  }
ret:
  return param->type;
}
コード例 #4
0
ファイル: ft_parser.c プロジェクト: isleon/Jaxer
/*
  Fetch the next token from a boolean-mode full-text query string.

  SYNOPSIS
    cs     character set used by true_word_char() and my_mbcharlen()
    start  in:  where scanning begins
           out: first byte after the token that was returned
    end    one past the last byte that may be examined
    word   out: pos/len of the word (for return values 1 and 4)
    param  in/out parser state:
             quot               non-NULL while inside a quoted phrase
             prev               last non-word byte seen ('A' after a word)
             yesno, plusminus, pmsign
                                operator modifiers collected in front of a
                                word; reset on entry and whenever a
                                non-operator, non-word byte is passed
             trunc              set when the word is followed by FTB_TRUNC

  RETURN VALUE
  0 - eof
  1 - word found
  2 - left bracket
  3 - right bracket
  4 - stopword found
*/
byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
                 FT_WORD *word, FTB_PARAM *param)
{
  byte *doc=*start;
  uint mwc, length, mbl;

  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
  param->plusminus=param->pmsign=0;

  while (doc<end)
  {
    /*
      Skip non-word characters, handling brackets, quotes and operators.
      my_mbcharlen() may report 0; always advance by at least one byte so
      the loop cannot stall on such a byte (same guard as the word loop
      below uses).
    */
    for (; doc < end; doc+= (mbl ? mbl : 1))
    {
      if (true_word_char(cs,*doc)) break;
      if (*doc == FTB_RQUOT && param->quot)
      {
        /* Closing quote: remember where it was and report a right bracket. */
        param->quot=doc;
        *start=doc+1;
        return 3; /* FTB_RBR */
      }
      /* Must be computed before any 'continue' below: the loop step uses it. */
      mbl= my_mbcharlen(cs, *(uchar *)doc);
      if (!param->quot)
      {
        if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
        {
          /* param->prev=' '; */
          *start=doc+1;
          if (*doc == FTB_LQUOT) param->quot=*start;
          return (*doc == FTB_RBR)+2;   /* 2 for FTB_LBR/FTB_LQUOT, 3 for FTB_RBR */
        }
        if (param->prev == ' ')
        {
          /*
            Operators are honoured only right after a space. Each 'continue'
            skips the reset below, so consecutive operators accumulate.
          */
          if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
          if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
          if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
          if (*doc == FTB_INC ) { param->plusminus++; continue; } else
          if (*doc == FTB_DEC ) { param->plusminus--; continue; } else
          if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
        }
      }
      /* Any other non-word byte cancels the modifiers collected so far. */
      param->prev=*doc;
      param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
      param->plusminus=param->pmsign=0;
    }

    /*
      Collect the word. length counts characters (one per iteration); mwc
      counts misc characters seen since the last true word character so that
      they can be trimmed from word->len.
    */
    mwc=length=0;
    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
      if (true_word_char(cs,*doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;

    param->prev='A'; /* be sure *prev is true_word_char */
    word->len= (uint)(doc-word->pos) - mwc;
    /* A truncation operator right after the word is consumed with it. */
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
      doc++;

    /*
      A word is accepted when it is shorter than ft_max_word_len and either
      carries the truncation operator or is long enough and not a stopword.
    */
    if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len))
         || param->trunc) && length < ft_max_word_len)
    {
      *start=doc;
      return 1;
    }
    else if (length) /* make sure length > 0 (if start contains spaces only) */
    {
      *start= doc;
      return 4;
    }
  }
  if (param->quot)
  {
    /* Input ended inside a quoted phrase: report an implicit closing bracket. */
    param->quot=*start=doc;
    return 3; /* FTB_RBR */
  }
  return 0;
}