コード例 #1
0
ファイル: ft_parser.c プロジェクト: Baoxiyi-Github/Mysql
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *doc= *start;
  uint mwc, length;
  int mbl;
  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
    {
      if (doc >= end)
        DBUG_RETURN(0);
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        break;
    }

    mwc= length= 0;
    for (word->pos= doc; doc < end; length++,
         doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        mwc= 0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;
    }

    word->len= (uint)(doc-word->pos) - mwc;

    if (skip_stopwords == FALSE ||
        (length >= ft_min_word_len && length < ft_max_word_len &&
         !is_stopword((char*) word->pos, word->len)))
    {
      *start= doc;
      DBUG_RETURN(1);
    }
  } while (doc < end);
  DBUG_RETURN(0);
}
コード例 #2
0
ファイル: ft_parser.c プロジェクト: isleon/Jaxer
byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
                        FT_WORD *word, my_bool skip_stopwords)
{
  byte *doc= *start;
  uint mwc, length, mbl;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    for (;; doc+= (mbl ? mbl : 1))
    {
      if (doc >= end) DBUG_RETURN(0);
      if (true_word_char(cs, *doc)) break;
      mbl= my_mbcharlen(cs, *(uchar *)doc);
    }

    mwc= length= 0;
    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
      if (true_word_char(cs,*doc))
        mwc= 0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;

    word->len= (uint)(doc-word->pos) - mwc;

    if (skip_stopwords == FALSE ||
        (length >= ft_min_word_len && length < ft_max_word_len &&
         !is_stopword(word->pos, word->len)))
    {
      *start= doc;
      DBUG_RETURN(1);
    }
  } while (doc < end);
  DBUG_RETURN(0);
}
コード例 #3
0
/*
  RETURN VALUE
  0 - eof
  1 - word found
  2 - left bracket
  3 - right bracket
  4 - stopword found
*/
uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end,
                  FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
{
  const uchar *doc=*start;
  int ctype;
  uint mwc, length;
  int mbl;

  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
  param->weight_adjust= param->wasign= 0;
  param->type= FT_TOKEN_EOF;

  while (doc<end)
  {
    for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        break;
      if (*doc == FTB_RQUOT && param->quot)
      {
        *start=doc+1;
        param->type= FT_TOKEN_RIGHT_PAREN;
        goto ret;
      }
      if (!param->quot)
      {
        if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
        {
          /* param->prev=' '; */
          *start=doc+1;
          if (*doc == FTB_LQUOT)
            param->quot= (char*) 1;
          param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
          goto ret;
        }
        if (param->prev == ' ')
        {
          if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
          if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
          if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
          if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
          if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
          if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
        }
      }
      param->prev=*doc;
      param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
      param->weight_adjust= param->wasign= 0;
    }

    mwc=length=0;
    for (word->pos= doc; doc < end; length++,
         doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;
    }
    param->prev='A'; /* be sure *prev is true_word_char */
    word->len= (uint)(doc-word->pos) - mwc;
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
      doc++;

    if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
                                                    word->len))
         || param->trunc) && length < ft_max_word_len)
    {
      *start=doc;
      param->type= FT_TOKEN_WORD;
      goto ret;
    }
    else if (length) /* make sure length > 0 (if start contains spaces only) */
    {
      *start= doc;
      param->type= FT_TOKEN_STOPWORD;
      goto ret;
    }
  }
  if (param->quot)
  {
    *start= doc;
    param->type= 3; /* FT_RBR */
    goto ret;
  }
ret:
  return param->type;
}
コード例 #4
0
ファイル: index.c プロジェクト: tunefish/I2A-Project3
/*
 * Parses a file and adds its words to the index
 */
void parse_file_for_index(index_p index, char *file) {
    // open file or print error message
    FILE *f = fopen(file, "r");
    if (!f) {
        printf("Cannot open %s!\nIndex not updated.\n", file);
        return;
    }

    // document id = index of document in list of all documents in filebase (alphabetically ordered)
    int doc_id = find_str(&index->documents[0].name, sizeof(indexed_document_t), file, 0, index->nr_docs-1);

    if (doc_id < 0) {
        printf("Error: %s is not in the filebase!\n", file);
        return;
    }

    char *l;
    while ((l = read_line(f))) {
        // turn non alpha characters into spaces
        nonalpha_to_space(l);

        char *word = strtok(l, " ");
        while (word) {
            // ignore stopwords
            if (is_stopword(word)) {
                word = strtok(NULL, " ");
                continue;
            }

            char *word_stem = stem(word);

            if (!strlen(word_stem)) {
                word = strtok(NULL, " ");
                continue;
            }

            // insert document into index / add new stem to index
            indexed_word_p w = index->words;    // current word
            indexed_word_p p = NULL;            // previous word
            int flag = 0;
            while (w && !flag) {
                int cmp = strcmp(w->stem, word_stem);
                if (!cmp) {
                    // stem is already indexed
                    flag = 1;
                    break;
                } else if (0 < cmp) {
                    // stem not indexed yet
                    flag = 2;
                    break;
                }

                p = w;
                w = w->next;
            }

            if (flag == 1) {
                // stem indexed, add document to list
                int i;
                for (i = 0; i < w->nr_docs; i++) {
                    if (w->documents[i].id == doc_id) {
                        // document is already indexed for this stem
                        flag = 0;
                        break;
                    } else if (w->documents[i].id > doc_id) {
                        break;
                    }
                }

                // only add document to list if it's not already in the list
                if (flag) {
                    w = (indexed_word_p) realloc(w, sizeof(indexed_word_t) + sizeof(doc_t) * (w->nr_docs + 1));

                    // update pointer to this group (needed after realloc)
                    if (!p) {
                        index->words = w;
                    } else {
                        p->next = w;
                    }

                    // insert document in list
                    memmove(&w->documents[i+1], &w->documents[i], sizeof(doc_t) * (w->nr_docs - i));
                    w->documents[i].id = doc_id;
                    w->documents[i].tf = 1;
                    w->nr_docs++;
                } else {
                    // increase counter for number of occurances of this word in this document
                    w->documents[i].tf++;
                }

                free(word_stem);
            } else {
                // stem is not indexed, add it to index
                w = (indexed_word_p) malloc(sizeof(indexed_word_t) + sizeof(doc_t));
                w->stem = word_stem;
                w->nr_docs = 1;
                w->documents[0].id = doc_id;
                w->documents[0].tf = 1;

                index->nr_words++;

                // insert this word in linked list
                if (!p) {
                    w->next = index->words;
                    index->words = w;
                } else {
                    w->next = p->next;
                    p->next = w;
                }
            }

            // increase counter for total number of words in this document
            index->documents[doc_id].nr_words++;

            // get next word
            word = strtok(NULL, " ");
        }

        free(l);
    }

    fclose(f);

    // finalize computation of TF
    indexed_word_p w = index->words;
    while (w) {

        int i = find_int(&w->documents[0].id, sizeof(doc_t), doc_id, 0, w->nr_docs - 1);

        if (i >= 0) {
            w->documents[i].tf /= index->documents[doc_id].nr_words;
        }

        w = w->next;
    }
}
コード例 #5
0
ファイル: ft_parser.c プロジェクト: isleon/Jaxer
/*
  RETURN VALUE
  0 - eof
  1 - word found
  2 - left bracket
  3 - right bracket
  4 - stopword found
*/
byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
                 FT_WORD *word, FTB_PARAM *param)
{
  byte *doc=*start;
  uint mwc, length, mbl;

  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
  param->plusminus=param->pmsign=0;

  while (doc<end)
  {
    for (; doc < end; doc+= mbl)
    {
      if (true_word_char(cs,*doc)) break;
      if (*doc == FTB_RQUOT && param->quot)
      {
        param->quot=doc;
        *start=doc+1;
        return 3; /* FTB_RBR */
      }
      mbl= my_mbcharlen(cs, *(uchar *)doc);
      if (!param->quot)
      {
        if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
        {
          /* param->prev=' '; */
          *start=doc+1;
          if (*doc == FTB_LQUOT) param->quot=*start;
          return (*doc == FTB_RBR)+2;
        }
        if (param->prev == ' ')
        {
          if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
          if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
          if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
          if (*doc == FTB_INC ) { param->plusminus++; continue; } else
          if (*doc == FTB_DEC ) { param->plusminus--; continue; } else
          if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
        }
      }
      param->prev=*doc;
      param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
      param->plusminus=param->pmsign=0;
    }

    mwc=length=0;
    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
      if (true_word_char(cs,*doc))
        mwc=0;
      else if (!misc_word_char(*doc) || mwc)
        break;
      else
        mwc++;

    param->prev='A'; /* be sure *prev is true_word_char */
    word->len= (uint)(doc-word->pos) - mwc;
    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
      doc++;

    if (((length >= ft_min_word_len && !is_stopword(word->pos, word->len))
         || param->trunc) && length < ft_max_word_len)
    {
      *start=doc;
      return 1;
    }
    else if (length) /* make sure length > 0 (if start contains spaces only) */
    {
      *start= doc;
      return 4;
    }
  }
  if (param->quot)
  {
    param->quot=*start=doc;
    return 3; /* FTB_RBR */
  }
  return 0;
}