/*
  Find the next word in a document (simple, non-boolean parsing).

  Starting at *start, skips characters that true_word_char() rejects and
  then collects one run of word characters into *word.

    cs              character set; its ctype() callback classifies each
                    character and reports how far to advance
    start           in/out scan position; updated only when 1 is returned
    end             one past the last byte of the document
    word            receives the word start (pos) and its length (len)
    skip_stopwords  FALSE: return every word found; otherwise skip words
                    whose character count is outside
                    [ft_min_word_len, ft_max_word_len) and stopwords

  RETURN VALUE
    0 - end of document reached without returning a word
    1 - word found
*/
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end, FT_WORD *word,
                         my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint trailing_misc, nchars;
  int char_len;
  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Skip ahead to the first character that can be part of a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      char_len= cs->cset->ctype(cs, &ctype, (uchar*)cur, (uchar*)end);
      if (true_word_char(ctype, *cur))
        break;
      /* Advance by |char_len| bytes, or by one byte when it is zero. */
      if (char_len < 0)
        char_len= -char_len;
      else if (char_len == 0)
        char_len= 1;
      cur+= char_len;
    }

    /* Collect the word; nchars counts characters, not bytes. */
    trailing_misc= 0;
    nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      char_len= cs->cset->ctype(cs, &ctype, (uchar*)cur, (uchar*)end);
      if (true_word_char(ctype, *cur))
        trailing_misc= 0;
      else if (!misc_word_char(*cur) || trailing_misc)
        break;
      else
        trailing_misc++;

      nchars++;
      if (char_len < 0)
        char_len= -char_len;
      else if (char_len == 0)
        char_len= 1;
      cur+= char_len;
    }

    /* A misc character left at the very end is not part of the word. */
    word->len= (uint)(cur - word->pos) - trailing_misc;

    if (skip_stopwords == FALSE ||
        (nchars >= ft_min_word_len && nchars < ft_max_word_len &&
         !is_stopword((char*) word->pos, word->len)))
    {
      *start= cur;
      DBUG_RETURN(1);
    }
  } while (cur < end);

  DBUG_RETURN(0);
}
/*
  Find the next word in a document (simple, non-boolean parsing).

  Starting at *start, skips characters that true_word_char() rejects and
  then collects one run of word characters into *word.

    cs              character set used for classification and for
                    my_mbcharlen()
    start           in/out scan position; updated only when 1 is returned
    end             one past the last byte of the document
    word            receives the word start (pos) and its length (len)
    skip_stopwords  FALSE: return every word found; otherwise skip words
                    whose character count is outside
                    [ft_min_word_len, ft_max_word_len) and stopwords

  RETURN VALUE
    0 - end of document reached without returning a word
    1 - word found
*/
byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
                        FT_WORD *word, my_bool skip_stopwords)
{
  byte *cur= *start;
  uint trailing_misc, nchars, char_len;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Skip ahead to the first character that can be part of a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      if (true_word_char(cs, *cur))
        break;
      char_len= my_mbcharlen(cs, *(uchar *)cur);
      if (!char_len)
        char_len= 1;                    /* a zero length still advances one byte */
      cur+= char_len;
    }

    /* Collect the word; nchars counts characters, not bytes. */
    trailing_misc= 0;
    nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      if (true_word_char(cs, *cur))
        trailing_misc= 0;
      else if (!misc_word_char(*cur) || trailing_misc)
        break;
      else
        trailing_misc++;

      nchars++;
      char_len= my_mbcharlen(cs, *(uchar *)cur);
      if (!char_len)
        char_len= 1;
      cur+= char_len;
    }

    /* A misc character left at the very end is not part of the word. */
    word->len= (uint)(cur - word->pos) - trailing_misc;

    if (skip_stopwords == FALSE ||
        (nchars >= ft_min_word_len && nchars < ft_max_word_len &&
         !is_stopword(word->pos, word->len)))
    {
      *start= cur;
      DBUG_RETURN(1);
    }
  } while (cur < end);

  DBUG_RETURN(0);
}
/* RETURN VALUE 0 - eof 1 - word found 2 - left bracket 3 - right bracket 4 - stopword found */ uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) { const uchar *doc=*start; int ctype; uint mwc, length; int mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->weight_adjust= param->wasign= 0; param->type= FT_TOKEN_EOF; while (doc<end) { for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) { mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); if (true_word_char(ctype, *doc)) break; if (*doc == FTB_RQUOT && param->quot) { *start=doc+1; param->type= FT_TOKEN_RIGHT_PAREN; goto ret; } if (!param->quot) { if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) { /* param->prev=' '; */ *start=doc+1; if (*doc == FTB_LQUOT) param->quot= (char*) 1; param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); goto ret; } if (param->prev == ' ') { if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else if (*doc == FTB_NO ) { param->yesno=-1; continue; } else if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } } } param->prev=*doc; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->weight_adjust= param->wasign= 0; } mwc=length=0; for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) { mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); if (true_word_char(ctype, *doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; } param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) doc++; if (((length >= ft_min_word_len && !is_stopword((char*) word->pos, word->len)) || param->trunc) && length < ft_max_word_len) { *start=doc; param->type= FT_TOKEN_WORD; goto ret; } else if (length) /* make sure length > 0 (if start contains spaces only) */ { *start= doc; param->type= FT_TOKEN_STOPWORD; goto ret; } } if (param->quot) { *start= doc; param->type= 3; /* FT_RBR */ goto ret; } ret: return param->type; }
/*
 * Parses a file and adds its words to the index.
 *
 * Every line of the file is split into words; stopwords and words with an
 * empty stem are ignored. For each remaining stem the document is added to
 * (or its occurrence counter increased in) the stem's entry of the sorted
 * word list index->words. Finally the occurrence counters of this document
 * are divided by the document's total word count.
 *
 * index: index to update; `file` must already be listed in index->documents
 * file:  path of the file to parse
 *
 * Prints a message and leaves the index untouched if the file cannot be
 * opened or is not part of the filebase. If memory runs out, parsing stops
 * early and only the words processed so far are indexed.
 */
void parse_file_for_index(index_p index, char *file) {
    // open file or print error message
    FILE *f = fopen(file, "r");
    if (!f) {
        printf("Cannot open %s!\nIndex not updated.\n", file);
        return;
    }

    // document id = index of document in list of all documents in filebase (alphabetically ordered)
    int doc_id = find_str(&index->documents[0].name, sizeof(indexed_document_t), file, 0, index->nr_docs-1);
    if (doc_id < 0) {
        printf("Error: %s is not in the filebase!\n", file);
        fclose(f);  // do not leak the file handle on this early exit
        return;
    }

    int out_of_memory = 0;
    char *l;
    while (!out_of_memory && (l = read_line(f))) {
        // turn non alpha characters into spaces
        nonalpha_to_space(l);

        for (char *word = strtok(l, " "); word; word = strtok(NULL, " ")) {
            // ignore stopwords
            if (is_stopword(word)) {
                continue;
            }

            // the stem is owned by this function until it is stored in the index
            char *word_stem = stem(word);
            if (!word_stem || word_stem[0] == '\0') {
                free(word_stem);
                continue;
            }

            // find the stem in the sorted list, or the position where it belongs
            indexed_word_p w = index->words;    // current word
            indexed_word_p p = NULL;            // previous word
            int cmp = -1;
            while (w && (cmp = strcmp(w->stem, word_stem)) < 0) {
                p = w;
                w = w->next;
            }

            if (w && cmp == 0) {
                // stem indexed: find the document or its insert position
                int i;
                int already_listed = 0;
                for (i = 0; i < w->nr_docs; i++) {
                    if (w->documents[i].id == doc_id) {
                        already_listed = 1;
                        break;
                    } else if (w->documents[i].id > doc_id) {
                        break;
                    }
                }

                if (already_listed) {
                    // increase counter for number of occurrences of this word in this document
                    w->documents[i].tf++;
                } else {
                    // grow the entry by one document; keep the old pointer until realloc succeeds
                    indexed_word_p grown = (indexed_word_p) realloc(w, sizeof(indexed_word_t) + sizeof(doc_t) * (w->nr_docs + 1));
                    if (!grown) {
                        printf("Out of memory while indexing %s!\n", file);
                        free(word_stem);
                        out_of_memory = 1;
                        break;
                    }
                    w = grown;

                    // update pointer to this group (needed after realloc)
                    if (!p) {
                        index->words = w;
                    } else {
                        p->next = w;
                    }

                    // insert document in list
                    memmove(&w->documents[i+1], &w->documents[i], sizeof(doc_t) * (w->nr_docs - i));
                    w->documents[i].id = doc_id;
                    w->documents[i].tf = 1;
                    w->nr_docs++;
                }

                free(word_stem);
            } else {
                // stem is not indexed, add it to index
                indexed_word_p fresh = (indexed_word_p) malloc(sizeof(indexed_word_t) + sizeof(doc_t));
                if (!fresh) {
                    printf("Out of memory while indexing %s!\n", file);
                    free(word_stem);
                    out_of_memory = 1;
                    break;
                }
                fresh->stem = word_stem;    // ownership of the stem moves to the index
                fresh->nr_docs = 1;
                fresh->documents[0].id = doc_id;
                fresh->documents[0].tf = 1;
                index->nr_words++;

                // insert this word in linked list
                if (!p) {
                    fresh->next = index->words;
                    index->words = fresh;
                } else {
                    fresh->next = p->next;
                    p->next = fresh;
                }
            }

            // increase counter for total number of words in this document
            index->documents[doc_id].nr_words++;
        }

        free(l);
    }

    fclose(f);

    // finalize computation of TF
    for (indexed_word_p w = index->words; w; w = w->next) {
        int i = find_int(&w->documents[0].id, sizeof(doc_t), doc_id, 0, w->nr_docs - 1);
        if (i >= 0) {
            w->documents[i].tf /= index->documents[doc_id].nr_words;
        }
    }
}
/* RETURN VALUE 0 - eof 1 - word found 2 - left bracket 3 - right bracket 4 - stopword found */ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, FT_WORD *word, FTB_PARAM *param) { byte *doc=*start; uint mwc, length, mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->plusminus=param->pmsign=0; while (doc<end) { for (; doc < end; doc+= mbl) { if (true_word_char(cs,*doc)) break; if (*doc == FTB_RQUOT && param->quot) { param->quot=doc; *start=doc+1; return 3; /* FTB_RBR */ } mbl= my_mbcharlen(cs, *(uchar *)doc); if (!param->quot) { if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) { /* param->prev=' '; */ *start=doc+1; if (*doc == FTB_LQUOT) param->quot=*start; return (*doc == FTB_RBR)+2; } if (param->prev == ' ') { if (*doc == FTB_YES ) { param->yesno=+1; continue; } else if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else if (*doc == FTB_NO ) { param->yesno=-1; continue; } else if (*doc == FTB_INC ) { param->plusminus++; continue; } else if (*doc == FTB_DEC ) { param->plusminus--; continue; } else if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; } } } param->prev=*doc; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); param->plusminus=param->pmsign=0; } mwc=length=0; for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) if (true_word_char(cs,*doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) doc++; if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len)) || param->trunc) && length < ft_max_word_len) { *start=doc; return 1; } else if (length) /* make sure length > 0 (if start contains spaces only) */ { *start= doc; return 4; } } if (param->quot) { param->quot=*start=doc; return 3; /* FTB_RBR */ } return 0; }