/*
 * Split self->text into sentences and append each one to self->sentences.
 *
 * A sentence ends at any of '.', ';' or '?'. The same delimiter set is used
 * for every strtok() call; previously the continuation calls dropped '?',
 * so a question mark only terminated the very first piece.
 *
 * strtok() overwrites delimiters in self->text with '\0', so the buffer is
 * modified in place, and each `part` handed to sentence_new() points into it.
 */
void text_split(text_t * self)
{
    static const char delimiters[] = ".;?";
    char * part = strtok(self->text, delimiters);

    while (part != NULL) {
        list_sentence_add(self->sentences, sentence_new(part));
        part = strtok(NULL, delimiters);
    }
}
/*
 * Cut self->text at every '.', '?' or '!' and push one sentence object per
 * resulting piece onto self->sentencesList, in order of appearance.
 *
 * strtok() writes '\0' over the terminators, so self->text is modified in
 * place and every piece passed to sentence_new() points into that buffer.
 */
void text_divide(text_t * self)
{
    const char * terminators = ".?!";
    char * piece;

    for (piece = strtok(self->text, terminators);
         piece != NULL;
         piece = strtok(NULL, terminators)) {
        list_push_back(self->sentencesList, sentence_new(piece));
    }
}
/*
 * Read token lines from `fp` and print every sentence with
 * latex_print_dep_tree().
 *
 * Input layout as handled here: one token per line, fields separated by
 * tabs; a line consisting of just "\n" ends the current sentence.
 *
 * Each token line is copied, split in place on tabs, and the resulting field
 * pointers are handed to token_new(). The copies are parked in `blocks` and
 * released together at the end rather than per line (presumably because the
 * tokens keep referring to them - confirm against token_new()).
 *
 * Malformed lines are reported on stderr with their line number:
 *   - extra fields beyond CONLLX_TOKEN_NUM_FIELDS are ignored (they used to
 *     overflow `seq`);
 *   - missing fields are filled with "_", the CoNLL-X placeholder for an
 *     unavailable value (they used to be stale or uninitialised pointers).
 *
 * A final sentence that is not followed by a blank line is still printed and
 * counted.
 *
 * NOTE(review): a line longer than BUF_SIZE-1 characters is delivered by
 * fgets() in several chunks, and each chunk is treated as its own line.
 *
 * Returns 0 on success, -1 if a line could not be duplicated (out of memory).
 */
static int read_conllx(FILE* fp)
{
    static const char missing_field[] = "_";

    int rc = 0;
    int num_sent = 0;
    unsigned int line_num = 0;
    unsigned int num_tokens = 0; /* tokens added to the current sentence */
    char buff[BUF_SIZE];
    const char* sep = "\t";
    const char** seq = xmalloc(sizeof(*seq) * CONLLX_TOKEN_NUM_FIELDS);
    struct blocks* blocks = new_blocks(256);
    struct sentence* sent = sentence_new();

    while (fgets(buff, sizeof(buff), fp)) {
        line_num++;

        /* Blank line: the current sentence is complete. */
        if (buff[0] == '\n') {
            latex_print_dep_tree(sent);
            sentence_destroy(sent);
            sent = sentence_new();
            num_sent++;
            num_tokens = 0;
            continue;
        }

        /* Copy the line without its newline, if it has one; the last line of
         * a file may lack it, so do not blindly drop the final character. */
        size_t len = strcspn(buff, "\n");
        char* line = strndup(buff, len);
        if (line == NULL) {
            fprintf(stderr, "ERROR: out of memory at line %u\n", line_num);
            rc = -1;
            break;
        }
        blocks_add_elem(blocks, line);

        /* Split on tabs, never writing past the end of seq[]. */
        int i = 0;
        char* t;
        while (i < CONLLX_TOKEN_NUM_FIELDS && (t = strsep(&line, sep)) != NULL) {
            seq[i] = t;
            i++;
        }

        /* strsep() leaves `line` non-NULL when unread input remains. */
        if (line != NULL) {
            fprintf(stderr, "WARNING: line %u: too many fields, extra fields ignored\n",
                    line_num);
        }
        if (i < CONLLX_TOKEN_NUM_FIELDS) {
            fprintf(stderr, "WARNING: line %u: too few fields, missing fields set to \"%s\"\n",
                    line_num, missing_field);
            while (i < CONLLX_TOKEN_NUM_FIELDS) {
                seq[i] = missing_field;
                i++;
            }
        }

        sentence_add_token(sent, token_new(seq, CONLLX_TOKEN_NUM_FIELDS));
        num_tokens++;
    }

    /* Input ended without a closing blank line: flush the pending sentence. */
    if (rc == 0 && num_tokens > 0) {
        latex_print_dep_tree(sent);
        num_sent++;
    }

    destroy_blocks(blocks);
    sentence_destroy(sent);
    free(seq);

    fprintf(stderr, "INFO: Number of sentences = %d\n", num_sent);
    return rc;
}
/*
 * Parse the file `file_name` into `article`.
 *
 * Opens article->stream on the file, then repeatedly:
 *   1. skips to the next word (STREAM_FIND_WORD) and starts a new sentence
 *      in article->sentences at the current stream position;
 *   2. reads words (STREAM_GET_WORD) until end_of_line() reports that the
 *      word closes the sentence, or the stream ends.
 *
 * For every word, its core form is obtained with get_word_core(). If the
 * core is listed in lang->exclude it is released again; otherwise its stem
 * is obtained with get_word_stem() and looked up (or inserted) in
 * article->words, where num_occ counts the occurrences of that stem.
 *
 * is_para_end is passed to STREAM_GET_WORD (presumably updated by it -
 * confirm against the macro); when it is SMRZR_TRUE at the start of a
 * sentence, that sentence is flagged is_para_begin and the flag is cleared.
 *
 * Returns SMRZR_OK on success. Failure paths go through ERROR_RET, which is
 * defined elsewhere.
 *
 * NOTE(review): sentence->end and article->num_words are only updated when
 * end_of_line() fires; a last sentence cut off by the end of the stream
 * gets neither - confirm whether that is intended.
 */
status_t parse_article(const char* file_name, lang_t* lang, article_t* article)
{
    string_t word, word_core, word_stem;
    sentence_t* sentence;
    word_t* entry;
    stream_t* stream = &article->stream;
    bool_t is_new, is_para_end = SMRZR_FALSE;

    PROF_START;

    if (SMRZR_OK != stream_create(file_name, stream)) {
        ERROR_RET;
    }

    while (!STREAM_END(stream)) {
        STREAM_FIND_WORD(stream);
        if (STREAM_END(stream)) {
            break;
        }

        sentence = sentence_new(&article->sentences, article->stream.curr);
        assert(NULL != sentence);

        if (SMRZR_TRUE == is_para_end) {
            sentence->is_para_begin = SMRZR_TRUE;
            is_para_end = SMRZR_FALSE;
        }

        while (!STREAM_END(stream)) {
            STREAM_GET_WORD(stream, word, is_para_end);
            sentence->num_words++;

            word_core = get_word_core(&article->stack, lang, word);
            if (NULL == word_core) {
                ERROR_RET;
            }

            if (NULL != array_search(lang->exclude, word_core, comp_strings)) {
                /* Excluded word: drop its core form again. */
                array_pop_free(article->stack, word_core);
            } else {
                word_stem = get_word_stem(&article->stack, lang, word_core, SMRZR_TRUE);
                if (NULL == word_stem) {
                    ERROR_RET;
                }

                entry = array_search_or_alloc(&article->words, word_stem,
                                              comp_word_by_stem, &is_new);
                if (NULL == entry) {
                    ERROR_RET;
                }

                if (SMRZR_TRUE != is_new) {
                    /* Stem already known: count it and drop the duplicate. */
                    ++(entry->num_occ);
                    array_pop_free(article->stack, word_stem);
                } else {
                    /* First occurrence: the entry takes over this stem. */
                    entry->num_occ = 1;
                    entry->stem = word_stem;
                }
            }

            if (!end_of_line(lang, word)) {
                continue;
            }

            /* This word closes the sentence. */
            sentence->end = word + strlen(word);
            article->num_words += sentence->num_words;
            break;
        }
    }

    PROF_END("article parsing");

    return SMRZR_OK;
}