Пример #1
0
/*
 * Collect the tokens of the next sentence from the sentencizer behind `imp`.
 *
 * The sentencizer's sentence buffer is cleared, the next sentence span is
 * obtained from next_sentence(), and that span is run through a local
 * tokenizer whose tokens are gathered into the buffer.
 *
 * imp: handle that is cast to struct sentencizer; its text must already be
 *      set (asserted through szr->str).
 * tks: out-parameter; receives the buffer's token array, or NULL when
 *      next_sentence() returns NULL.
 *
 * Returns the number of tokens stored in the sentence buffer, or 0 when
 * next_sentence() returns NULL.
 */
local size_t sentencizer_next(struct mascara *imp, struct mr_token **tks)
{
   struct sentencizer *szr = (struct sentencizer *)imp;
   struct sentence *sent = &szr->sent;

   assert(szr->str && "text no set");
   sentence_clear(sent);

   size_t len;
   const unsigned char *last_period;
   const unsigned char *str = next_sentence(szr, &len, &last_period);
   if (!str) {
      *tks = NULL;
      return 0;
   }

   /* Take the pointer difference first: "offset_incr + str - szr->str"
    * would be evaluated as "(offset_incr + str) - szr->str" and could form
    * an intermediate pointer outside the text buffer, which is undefined. */
   size_t offset_incr = szr->offset_incr + (size_t)(str - szr->str);

   struct tokenizer tkr;
   tokenizer_init(&tkr, szr->vtab);
   tokenizer_set_text(&tkr.base, str, len, offset_incr);

   struct mr_token *tk;
   while (tokenizer_next(&tkr.base, &tk)) {
      /* The token located at last_period is always added as-is. Any other
       * token is first offered to sentencizer_reattach_period() and is only
       * added when that call returns zero (presumably meaning the token was
       * not merged into the sentence -- confirm against that helper). */
      if (tk->str == (const char *)last_period ||
         !sentencizer_reattach_period(sent, tk)) {
         sentence_add(sent, tk);
         if (sent->len == MR_MAX_SENTENCE_LEN) {
            /* Sentence buffer is full: record the position right after this
             * token in szr->p and stop (presumably so the next call resumes
             * from there -- confirm against next_sentence()). */
            szr->p = (const unsigned char *)tk->str + tk->len;
            break;
         }
      }
   }
   *tks = sent->tokens;
   return sent->len;
}
Пример #2
0
/*
 * Reads "text.txt", splits it into lowercase alphabetic words grouped into
 * sentences, then, for every stop word listed in "stopwords.txt", writes a
 * line "<word>: <text_find result>" to "result.txt".
 *
 * "stopwords.txt" is expected to start with the number of words, followed by
 * the words themselves (each at most 14 characters; longer input is split by
 * the bounded read).
 *
 * Returns 0 on success and 1 when one of the files cannot be opened.
 */
int main (){
    FILE *input = fopen("text.txt", "r");
    if (input == NULL) {
        fprintf(stderr, "cannot open text.txt\n");
        return 1;
    }

    char *str = strnew();

    sentence_t *currSentence = sentence_create();
    text_t *text = text_create();

    while (1) {
        /* fgetc() returns int: keeping it in an int is what makes the EOF
         * comparison reliable and the <ctype.h> calls below well-defined. */
        int c = fgetc(input);

        /* Any separator (or end of input) finishes the word being built. */
        if (c == EOF || isspace(c) || c == ',' || c == '.' || c == '!' || c == '?' || c == ';') {
            if (strlen(str) != 0) {
                word_t *word = word_new(str);
                sentence_add(currSentence, word);
                free (str);
                /* NOTE(review): releasing the word right after adding it
                 * assumes sentence_add() keeps its own copy -- confirm. */
                word_free (word);
                str = strnew();
            }
        }
        /* A sentence terminator (or end of input) finishes the sentence. */
        if (c == EOF || c == '.' || c == '!' || c == '?') {
            text_add (text, currSentence);
            /* NOTE(review): same copy assumption as above for text_add(). */
            sentence_free(currSentence);
            currSentence = sentence_create();
        }
        if (c == EOF) break;
        if (isalpha(c)) {
            char *w = stradd(str, (char)tolower(c));
            free (str);
            str = w;
        }
    }
    fclose (input);

    int status = 1;
    FILE *output = fopen("result.txt", "w");
    input = fopen ("stopwords.txt", "r");

    if (output != NULL && input != NULL) {
        int stopCount = 0;
        /* A missing or malformed count means there is nothing to look up. */
        if (fscanf (input, "%d", &stopCount) != 1) {
            stopCount = 0;
        }
        for (int i = 0; i < stopCount; i++) {
            char s[15];
            /* Width 14 leaves room for the terminator in s[15]. */
            if (fscanf (input, "%14s", s) != 1) {
                break;  /* fewer words than announced */
            }
            fprintf (output, "%s: %d\n", s, text_find (text, s));
        }
        status = 0;
    } else {
        fprintf(stderr, "cannot open result.txt or stopwords.txt\n");
    }

    /* fclose(NULL) is undefined, so only close what was actually opened. */
    if (input != NULL) {
        fclose (input);
    }
    if (output != NULL) {
        fclose (output);
    }
    free (str);
    sentence_free(currSentence);
    text_free(text);
    return status;
}
Пример #3
0
/*
 * Reads the file named by `txt` character by character and splits it into
 * words and sentences.
 *
 * Characters that are neither punctuation nor whitespace are accumulated
 * into a word. Whitespace ends the current word and adds it to the current
 * sentence. '.', '!' or '?' directly following word characters ends the
 * word, adds it to the current sentence, adds that sentence to `text` and
 * continues with a fresh sentence from sentence_t_new(). All other
 * punctuation is skipped without ending the word.
 *
 * txt:      path of the file to read; the function returns silently when it
 *           cannot be opened.
 * text:     receives every completed sentence via sentence_add().
 * sentence: sentence that receives words until the first terminator.
 *
 * Words longer than 24 characters are truncated to 24 characters.
 *
 * NOTE(review): a word still pending at end of file is not added, and a
 * sentence that has not seen a terminator is never passed to sentence_add();
 * the sentence created after the last terminator is not released here.
 * NOTE(review): a terminator that follows whitespace (pending word empty)
 * is skipped and does not close the sentence.
 */
void file_read (const char * txt, text_t * text, sentence_t * sentence)
{
    FILE * file = fopen(txt, "r");
    if(file == NULL)
        return;
    /* int, not char: required for a reliable EOF comparison and for
     * well-defined ispunct()/isspace() calls on bytes above 127. */
    int c;
    size_t len = 0;
    char word[25] = "";
    while((c = fgetc(file)) != EOF)
    {
        if(ispunct(c))
        {
            if(c == '.' || c == '!' || c == '?')
            {
                if(len == 0)
                    continue;
                word_add(sentence, word_t_new(word));
                sentence_add(text, sentence);
                sentence = sentence_t_new();
                memset(word, 0, sizeof word);
                len = 0;
            }
            continue;
        }
        else if(isspace(c))
        {
            if(len == 0)
                continue;
            word_add(sentence, word_t_new(word));
            memset(word, 0, sizeof word);
            len = 0;
        }
        else if(len < sizeof word - 1)
        {
            /* Keep the last byte as the terminator; characters beyond the
             * buffer capacity are dropped instead of overflowing it. */
            word[len] = (char)c;
            len++;
        }
    }
    fclose(file);
}