/*
  Find the next word in a plain-text document.

  Scanning begins at *start and never reads at or beyond 'end'.  Characters
  are first skipped until one satisfies true_word_char(); the word then
  extends for as long as characters satisfy true_word_char(), with at most
  one consecutive misc_word_char() character tolerated in between.  A
  trailing misc character is not counted in word->len.

  The cursor advances by the value returned by the charset's ctype()
  handler: the value itself when positive, its negation when negative, and
  one byte when it is zero.

  When skip_stopwords is not FALSE, a candidate is accepted only if its
  character count is >= ft_min_word_len and < ft_max_word_len and
  is_stopword() rejects it; otherwise scanning continues with the next
  candidate.

  RETURN VALUE
    0 - end of document reached without an acceptable word
    1 - word found: word->pos / word->len describe it and *start points
        just past the scanned text
*/
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end, FT_WORD *word,
                         my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint misc_tail, nchars;
  int step;
  int char_type;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Skip forward to the first character that can start a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      step= cs->cset->ctype(cs, &char_type, (uchar*)cur, (uchar*)end);
      if (true_word_char(char_type, *cur))
        break;
      cur+= (step > 0 ? step : (step < 0 ? -step : 1));
    }

    /* Consume the word; nchars counts characters, not bytes. */
    misc_tail= nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      step= cs->cset->ctype(cs, &char_type, (uchar*)cur, (uchar*)end);
      if (true_word_char(char_type, *cur))
        misc_tail= 0;
      else if (!misc_word_char(*cur) || misc_tail)
        break;
      else
        misc_tail++;
      nchars++;
      cur+= (step > 0 ? step : (step < 0 ? -step : 1));
    }

    /* Byte length of the word, without a trailing misc character. */
    word->len= (uint)(cur - word->pos) - misc_tail;

    /* Rejected candidates fall through to the loop condition below. */
    if (skip_stopwords != FALSE &&
        (nchars < ft_min_word_len || nchars >= ft_max_word_len ||
         is_stopword((char*) word->pos, word->len)))
      continue;

    *start= cur;
    DBUG_RETURN(1);
  } while (cur < end);

  DBUG_RETURN(0);
}
/*
  Find the next word in a plain-text document (my_mbcharlen() based
  variant).

  Scanning begins at *start.  Bytes are skipped until one satisfies
  true_word_char(); the word then extends for as long as characters satisfy
  true_word_char(), with at most one consecutive misc_word_char() character
  tolerated in between.  A trailing misc character is not counted in
  word->len.

  The cursor advances by my_mbcharlen() of the current lead byte, or by one
  byte when that function returns 0.

  When skip_stopwords is not FALSE, a candidate is accepted only if its
  character count is >= ft_min_word_len and < ft_max_word_len and
  is_stopword() rejects it; otherwise scanning continues with the next
  candidate.

  RETURN VALUE
    0 - end of document reached without an acceptable word
    1 - word found: word->pos / word->len describe it and *start points
        just past the scanned text
*/
byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
                        FT_WORD *word, my_bool skip_stopwords)
{
  byte *cur= *start;
  uint misc_tail, nchars, char_len;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Skip forward to the first character that can start a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      if (true_word_char(cs, *cur))
        break;
      char_len= my_mbcharlen(cs, *(uchar *)cur);
      cur+= (char_len ? char_len : 1);
    }

    /* Consume the word; nchars counts characters, not bytes. */
    misc_tail= nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      if (true_word_char(cs, *cur))
        misc_tail= 0;
      else if (!misc_word_char(*cur) || misc_tail)
        break;
      else
        misc_tail++;
      nchars++;
      char_len= my_mbcharlen(cs, *(uchar *)cur);
      cur+= (char_len ? char_len : 1);
    }

    /* Byte length of the word, without a trailing misc character. */
    word->len= (uint)(cur - word->pos) - misc_tail;

    /* Rejected candidates fall through to the loop condition below. */
    if (skip_stopwords != FALSE &&
        (nchars < ft_min_word_len || nchars >= ft_max_word_len ||
         is_stopword(word->pos, word->len)))
      continue;

    *start= cur;
    DBUG_RETURN(1);
  } while (cur < end);

  DBUG_RETURN(0);
}
/* RETURN VALUE 0 - eof 1 - word found 2 - left bracket 3 - right bracket 4 - stopword found */ uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) { const uchar *doc=*start; int ctype; uint mwc, length; int mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->weight_adjust= param->wasign= 0; param->type= FT_TOKEN_EOF; while (doc<end) { for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) { mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); if (true_word_char(ctype, *doc)) break; if (*doc == FTB_RQUOT && param->quot) { *start=doc+1; param->type= FT_TOKEN_RIGHT_PAREN; goto ret; } if (!param->quot) { if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) { /* param->prev=' '; */ *start=doc+1; if (*doc == FTB_LQUOT) param->quot= (char*) 1; param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); goto ret; } if (param->prev == ' ') { if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else if (*doc == FTB_NO ) { param->yesno=-1; continue; } else if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } } } param->prev=*doc; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->weight_adjust= param->wasign= 0; } mwc=length=0; for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) { mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); if (true_word_char(ctype, *doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; } param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) doc++; if (((length >= ft_min_word_len && !is_stopword((char*) word->pos, word->len)) || param->trunc) && length < ft_max_word_len) { *start=doc; param->type= FT_TOKEN_WORD; goto ret; } else if (length) /* make sure length > 0 (if start contains spaces only) */ { *start= doc; param->type= FT_TOKEN_STOPWORD; goto ret; } } if (param->quot) { *start= doc; param->type= 3; /* FT_RBR */ goto ret; } ret: return param->type; }
/* RETURN VALUE 0 - eof 1 - word found 2 - left bracket 3 - right bracket 4 - stopword found */ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) { byte *doc=*start; uint mwc, length, mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->plusminus=param->pmsign=0; while (doc<end) { for (; doc < end; doc+= mbl) { if (true_word_char(cs,*doc)) break; if (*doc == FTB_RQUOT && param->quot) { param->quot=doc; *start=doc+1; return 3; /* FTB_RBR */ } mbl= my_mbcharlen(cs, *(uchar *)doc); if (!param->quot) { if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) { /* param->prev=' '; */ *start=doc+1; if (*doc == FTB_LQUOT) param->quot=*start; return (*doc == FTB_RBR)+2; } if (param->prev == ' ') { if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else if (*doc == FTB_NO ) { param->yesno=-1; continue; } else if (*doc == FTB_INC ) { param->plusminus++; continue; } else if (*doc == FTB_DEC ) { param->plusminus--; continue; } else if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; } } } param->prev=*doc; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->plusminus=param->pmsign=0; } mwc=length=0; for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) if (true_word_char(cs,*doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) doc++; if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len)) || param->trunc) && length < ft_max_word_len) { *start=doc; return 1; } else if (length) /* make sure length > 0 (if start contains spaces only) */ { *start= doc; return 4; } } if (param->quot) { param->quot=*start=doc; return 3; /* FTB_RBR */ } return 0; }