static my_bool init_state_maps(CHARSET_INFO *cs) { uint i; uchar *state_map; uchar *ident_map; if (!(cs->state_map= state_map= (uchar*) my_once_alloc(256, MYF(MY_WME)))) return 1; if (!(cs->ident_map= ident_map= (uchar*) my_once_alloc(256, MYF(MY_WME)))) return 1; /* Fill state_map with states to get a faster parser */ for (i=0; i < 256 ; i++) { if (my_isalpha(cs,i)) state_map[i]=(uchar) MY_LEX_IDENT; else if (my_isdigit(cs,i)) state_map[i]=(uchar) MY_LEX_NUMBER_IDENT; else if (my_mbcharlen(cs, i)>1) state_map[i]=(uchar) MY_LEX_IDENT; else if (my_isspace(cs,i)) state_map[i]=(uchar) MY_LEX_SKIP; else state_map[i]=(uchar) MY_LEX_CHAR; } state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT; state_map[(uchar)'\'']=(uchar) MY_LEX_STRING; state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT; state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP; state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP; state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL; state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT; state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON; state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR; state_map[0]=(uchar) MY_LEX_EOL; state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE; state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT; state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT; state_map[(uchar)'@']= (uchar) MY_LEX_USER_END; state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER; state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER; /* Create a second map to make it faster to find identifiers */ for (i=0; i < 256 ; i++) { ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT || state_map[i] == MY_LEX_NUMBER_IDENT); } /* Special handling of hex and binary strings */ state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX; state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN; state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR; return 0; }
byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end, FT_WORD *word, my_bool skip_stopwords) { byte *doc= *start; uint mwc, length, mbl; DBUG_ENTER("ft_simple_get_word"); do { for (;; doc+= (mbl ? mbl : 1)) { if (doc >= end) DBUG_RETURN(0); if (true_word_char(cs, *doc)) break; mbl= my_mbcharlen(cs, *(uchar *)doc); } mwc= length= 0; for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) if (true_word_char(cs,*doc)) mwc= 0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; word->len= (uint)(doc-word->pos) - mwc; if (skip_stopwords == FALSE || (length >= ft_min_word_len && length < ft_max_word_len && !is_stopword(word->pos, word->len))) { *start= doc; DBUG_RETURN(1); } } while (doc < end); DBUG_RETURN(0); }
char *my_strchr(CHARSET_INFO *cs, const char *str, const char *end, pchar c) { uint mbl; while (str < end) { mbl= my_mbcharlen(cs, *(uchar *)str); if (mbl < 2) { if (*str == c) return((char *)str); str++; } else str+= mbl; } return(0); }
static char *backtick_string(CHARSET_INFO *cs, char *to, char *end, char *par, size_t par_len, char quote_char) { uint char_len; char *start= to; char *par_end= par + par_len; size_t buff_length= (size_t) (end - to); if (buff_length <= par_len) goto err; *start++= quote_char; for ( ; par < par_end; par+= char_len) { uchar c= *(uchar *) par; if (!(char_len= my_mbcharlen(cs, c))) char_len= 1; if (char_len == 1 && c == (uchar) quote_char ) { if (start + 1 >= end) goto err; *start++= quote_char; } if (start + char_len >= end) goto err; start= strnmov(start, par, char_len); } if (start + 1 >= end) goto err; *start++= quote_char; return start; err: *to='\0'; return to; }
size_t escape_string_for_mysql(CHARSET_INFO *charset_info, char *to, size_t to_length, const char *from, size_t length) { const char *to_start= to; const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length); my_bool overflow= FALSE; #ifdef USE_MB my_bool use_mb_flag= use_mb(charset_info); #endif for (end= from + length; from < end; from++) { char escape= 0; #ifdef USE_MB int tmp_length; if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end))) { if (to + tmp_length > to_end) { overflow= TRUE; break; } while (tmp_length--) *to++= *from++; from--; continue; } /* If the next character appears to begin a multi-byte character, we escape that first byte of that apparent multi-byte character. (The character just looks like a multi-byte character -- if it were actually a multi-byte character, it would have been passed through in the test above.) Without this check, we can create a problem by converting an invalid multi-byte character into a valid one. For example, 0xbf27 is not a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \) */ if (use_mb_flag && (tmp_length= my_mbcharlen(charset_info, *from)) > 1) escape= *from; else #endif switch (*from) { case 0: /* Must be escaped for 'mysql' */ escape= '0'; break; case '\n': /* Must be escaped for logs */ escape= 'n'; break; case '\r': escape= 'r'; break; case '\\': escape= '\\'; break; case '\'': escape= '\''; break; case '"': /* Better safe than sorry */ escape= '"'; break; case '\032': /* This gives problems on Win32 */ escape= 'Z'; break; } if (escape) { if (to + 2 > to_end) { overflow= TRUE; break; } *to++= '\\'; *to++= escape; } else { if (to + 1 > to_end) { overflow= TRUE; break; } *to++= *from; } } *to= 0; return overflow ? (size_t) -1 : (size_t) (to - to_start); }
/* RETURN VALUE 0 - eof 1 - word found 2 - left bracket 3 - right bracket 4 - stopword found */ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) { byte *doc=*start; uint mwc, length, mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->plusminus=param->pmsign=0; while (doc<end) { for (; doc < end; doc+= mbl) { if (true_word_char(cs,*doc)) break; if (*doc == FTB_RQUOT && param->quot) { param->quot=doc; *start=doc+1; return 3; /* FTB_RBR */ } mbl= my_mbcharlen(cs, *(uchar *)doc); if (!param->quot) { if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) { /* param->prev=' '; */ *start=doc+1; if (*doc == FTB_LQUOT) param->quot=*start; return (*doc == FTB_RBR)+2; } if (param->prev == ' ') { if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else if (*doc == FTB_NO ) { param->yesno=-1; continue; } else if (*doc == FTB_INC ) { param->plusminus++; continue; } else if (*doc == FTB_DEC ) { param->plusminus--; continue; } else if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; } } } param->prev=*doc; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->plusminus=param->pmsign=0; } mwc=length=0; for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) if (true_word_char(cs,*doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) doc++; if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len)) || param->trunc) && length < ft_max_word_len) { *start=doc; return 1; } else if (length) /* make sure length > 0 (if start contains spaces only) */ { *start= doc; return 4; } } if (param->quot) { param->quot=*start=doc; return 3; /* FTB_RBR */ } return 0; }