/*
 * Fill the sentencizer's internal sentence buffer with the tokens of the
 * next chunk returned by next_sentence().
 *
 * imp  Generic handle; it is cast to a struct sentencizer.
 * tks  Out-parameter. Receives the sentence's token array, or NULL when
 *      next_sentence() yields nothing.
 *
 * Returns the number of tokens stored in the sentence (0 when there is
 * no further text).
 */
local size_t sentencizer_next(struct mascara *imp, struct mr_token **tks)
{
   struct sentencizer *self = (struct sentencizer *)imp;
   struct sentence *out = &self->sent;

   assert(self->str && "text no set");
   sentence_clear(out);

   size_t chunk_len;
   const unsigned char *final_period;
   const unsigned char *chunk = next_sentence(self, &chunk_len, &final_period);
   if (chunk == NULL) {
      *tks = NULL;
      return 0;
   }

   /* Offset handed to the tokenizer for the start of this chunk. */
   size_t base_offset = self->offset_incr + chunk - self->str;

   struct tokenizer tokenizer;
   tokenizer_init(&tokenizer, self->vtab);
   tokenizer_set_text(&tokenizer.base, chunk, chunk_len, base_offset);

   struct mr_token *cur;
   while (tokenizer_next(&tokenizer.base, &cur)) {
      /*
       * A token that does not start at final_period is first offered to
       * sentencizer_reattach_period(); when that call reports success the
       * token is not appended as a separate entry.
       */
      if (cur->str != (const char *)final_period
          && sentencizer_reattach_period(out, cur))
         continue;

      sentence_add(out, cur);
      if (out->len != MR_MAX_SENTENCE_LEN)
         continue;

      /* Sentence buffer is full: move the read position to just past
       * this token and stop tokenizing. */
      self->p = (const unsigned char *)cur->str + cur->len;
      break;
   }

   *tks = out->tokens;
   return out->len;
}
/*
 * Read "text.txt", split it into lower-cased alphabetic words grouped into
 * sentences, then for every entry listed in "stopwords.txt" write
 * "<word>: <count>" to "result.txt", where <count> is the value returned by
 * text_find().
 *
 * "stopwords.txt" is expected to start with an integer count followed by
 * that many whitespace-separated words. Each word is read with a 14-character
 * limit; a longer token is consumed in several pieces instead of overflowing
 * the buffer.
 *
 * Returns 0 on success, 1 when a file cannot be opened or the stop-word
 * count cannot be read.
 */
int main(void)
{
    FILE *input = fopen("text.txt", "r");
    if (input == NULL) {
        perror("text.txt");
        return 1;
    }

    int status = 1;
    int stopCount = 0;
    FILE *output = NULL;
    char *str = strnew();
    sentence_t *currSentence = sentence_create();
    text_t *text = text_create();

    for (;;) {
        /* fgetc returns int so that EOF stays distinct from every byte
         * value; the <ctype.h> calls below also require that range. */
        int c = fgetc(input);

        /* A separator (or end of input) closes the word being built. */
        if (c == EOF || isspace(c) || c == ',' || c == '.' || c == '!'
            || c == '?' || c == ';') {
            if (strlen(str) != 0) {
                word_t *word = word_new(str);
                sentence_add(currSentence, word);
                /* NOTE(review): the word is released right after being
                 * added, so sentence_add presumably keeps its own copy -
                 * confirm against its implementation. */
                free(str);
                word_free(word);
                str = strnew();
            }
        }

        /* A terminator (or end of input) closes the current sentence. */
        if (c == EOF || c == '.' || c == '!' || c == '?') {
            text_add(text, currSentence);
            /* NOTE(review): same pattern as above - text_add presumably
             * copies the sentence; confirm. */
            sentence_free(currSentence);
            currSentence = sentence_create();
        }

        if (c == EOF)
            break;

        if (isalpha(c)) {
            c = tolower(c);
            char *w = stradd(str, (char)c);
            free(str);
            str = w;
        }
    }
    fclose(input);
    input = NULL;

    output = fopen("result.txt", "w");
    if (output == NULL) {
        perror("result.txt");
        goto cleanup;
    }

    input = fopen("stopwords.txt", "r");
    if (input == NULL) {
        perror("stopwords.txt");
        goto cleanup;
    }

    if (fscanf(input, "%d", &stopCount) != 1) {
        fputs("stopwords.txt: missing stop-word count\n", stderr);
        goto cleanup;
    }

    for (int i = 0; i < stopCount; i++) {
        char s[15];
        /* Field width keeps the read inside s (14 characters + NUL). */
        if (fscanf(input, "%14s", s) != 1)
            break; /* fewer words than announced */
        fprintf(output, "%s: %d\n", s, text_find(text, s));
    }
    status = 0;

cleanup:
    if (input != NULL)
        fclose(input);
    if (output != NULL)
        fclose(output);
    free(str);
    sentence_free(currSentence);
    text_free(text);
    return status;
}
/*
 * Read the file named by txt and split it into words and sentences.
 *
 * txt       Path of the file to read. If it cannot be opened the function
 *           returns without doing anything.
 * text      Receives each completed sentence through sentence_add().
 * sentence  Sentence that collects the first words; after every sentence
 *           terminator a new one from sentence_t_new() takes its place.
 *
 * Rules applied per character:
 *   - '.', '!' or '?' directly after a word adds that word, hands the
 *     sentence to text and starts a new sentence; with no pending word the
 *     terminator is ignored.
 *   - any other punctuation is skipped and does not split the word.
 *   - whitespace ends the pending word, if any.
 *   - everything else is appended to the pending word. Words are capped at
 *     24 characters; extra characters are dropped so the buffer can never
 *     overflow.
 *
 * NOTE(review): a word still pending at end of file is not added, and the
 * sentence in progress at end of file is not handed to text. The
 * replacement sentence is only assigned to the local parameter, so the
 * caller never sees it - confirm this is intended.
 */
void file_read (const char * txt, text_t * text, sentence_t * sentence)
{
    FILE * file = fopen(txt, "r");
    if(file == NULL)
        return;

    /* int, not char: keeps EOF distinct from every byte value and keeps
     * the <ctype.h> arguments within their defined range. */
    int c;
    int len = 0;
    char word[25] = "";

    while((c = fgetc(file)) != EOF) {
        if(ispunct(c)) {
            if(c == '.' || c == '!' || c == '?') {
                if(len == 0)
                    continue;
                word_add(sentence, word_t_new(word));
                sentence_add(text, sentence);
                sentence = sentence_t_new();
                memset(word, 0, sizeof word);
                len = 0;
            }
            continue;
        } else if(isspace(c)) {
            if(len == 0)
                continue;
            word_add(sentence, word_t_new(word));
            memset(word, 0, sizeof word);
            len = 0;
        } else if(len < (int)sizeof word - 1) {
            /* Always leave the last byte as the NUL terminator. */
            word[len] = (char)c;
            len++;
        }
    }
    fclose(file);
}