Example #1
File: nlp.c Project: kpo99/projects
void text_split(text_t * self)
{
    /* Tokenize the text on sentence-ending punctuation.
       strtok writes '\0' over each delimiter, so self->text is modified in place. */
    char * part = strtok(self->text, ".;?");
    while (part != NULL) {
        list_sentence_add(self->sentences, sentence_new(part));
        part = strtok(NULL, ".;?");  /* same delimiter set as the first call */
    }
}
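For context, strtok consumes the buffer in place, so self->text must point to writable memory. A minimal driver sketch, assuming the text_t fields used above and a hypothetical list_sentence_new() constructor for the sentence list:

int main(void)
{
    char buf[] = "First sentence. Second one; third?";
    text_t doc;
    doc.text = buf;                       /* writable: strtok mutates it */
    doc.sentences = list_sentence_new();  /* hypothetical constructor */

    text_split(&doc);  /* doc.sentences now holds the three parts */
    return 0;
}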
Example #2
void text_divide(text_t * self)
{
    char * part = strtok(self->text, ".?!");
    while (part != NULL) {
        list_push_back(self->sentencesList, sentence_new(part));
        part = strtok(NULL, ".?!");
    }
}
Example #3
static int read_conllx(FILE* fp)
{
    int num_sent = 0;
    unsigned int line_num = 0;
    char buff[BUF_SIZE];
    const char* sep = "\t";
    char *t, *line, *tofree;
    size_t len;

    const char** seq = xmalloc(sizeof(char *) * CONLLX_TOKEN_NUM_FIELDS);
    struct blocks* blocks = new_blocks(256);
    struct sentence* sent = sentence_new();

    while (fgets(buff, sizeof(buff), fp)) {
        line_num++;
        /* A blank line terminates the current sentence. */
        if (buff[0] == '\n') {
            latex_print_dep_tree(sent);
            sentence_destroy(sent);
            sent = sentence_new();
            num_sent++;
            continue;
        }
        len = strnlen(buff, BUF_SIZE);
        tofree = line = strndup(buff, len - 1);  /* copy the line without its trailing newline */
        blocks_add_elem(blocks, tofree);

        /* Split the line into tab-separated fields, bounding the index so a
           malformed row cannot overflow seq. */
        int i = 0;
        while (i < CONLLX_TOKEN_NUM_FIELDS && (t = strsep(&line, sep)) != NULL) {
            seq[i] = t;
            i++;
        }
        sentence_add_token(sent, token_new(seq, CONLLX_TOKEN_NUM_FIELDS));
    }
    /* A final sentence not followed by a blank line is freed without printing. */
    destroy_blocks(blocks);
    sentence_destroy(sent);
    free(seq);
    fprintf(stderr, "INFO: Number of sentences = %d\n", num_sent);

    return 0;
}
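For reference, read_conllx appears to assume the standard CoNLL-X layout: one token per line, ten tab-separated fields (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL), and a blank line ending each sentence, so CONLLX_TOKEN_NUM_FIELDS is presumably 10. A two-token sentence would look like this (columns are tab-separated in the actual file):

1   The    the   DET   DT    _   2   det    _   _
2   cats   cat   NOUN  NNS   _   0   root   _   _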
Example #4
File: lib.c Project: mohnkhan/summarizer
status_t
parse_article(const char* file_name, lang_t* lang, article_t* article)
{
    string_t    word, word_core, word_stem;
    sentence_t* sentence;
    word_t*     word_entry;
    stream_t*   stream = &article->stream;
    bool_t      is_new, is_para_end = SMRZR_FALSE;

    PROF_START;

    if(SMRZR_OK != stream_create(file_name, stream))
        ERROR_RET;

    /* Outer loop: each pass parses one sentence from the stream. */
    while(!STREAM_END(stream)) {

        STREAM_FIND_WORD(stream);

        if(STREAM_END(stream)) break;

        sentence = sentence_new(&article->sentences, article->stream.curr);
        assert(NULL != sentence);

        if(SMRZR_TRUE == is_para_end) {
            sentence->is_para_begin = SMRZR_TRUE;
            is_para_end = SMRZR_FALSE;
        }

        /* Inner loop: consume words until end-of-sentence punctuation. */
        while(!STREAM_END(stream)) {

            STREAM_GET_WORD(stream, word, is_para_end);
            
            sentence->num_words++;

            if(NULL == (word_core = get_word_core(&article->stack, lang, word)))
                ERROR_RET;

            /* Skip stop words listed in lang->exclude. */
            if(NULL == array_search(lang->exclude, word_core, comp_strings)) {

                if(NULL == (word_stem = get_word_stem(&article->stack, lang,
                                                      word_core, SMRZR_TRUE)))
                    ERROR_RET;

                /* Count occurrences per stem; a new entry starts at one. */
                if(NULL == (word_entry = array_search_or_alloc(&article->words,
                                        word_stem, comp_word_by_stem, &is_new)))
                    ERROR_RET;

                if(SMRZR_TRUE == is_new) {
                    word_entry->num_occ = 1;
                    word_entry->stem = word_stem;
                } else {
                    ++(word_entry->num_occ);
                    array_pop_free(article->stack, word_stem);
                }
            } else {
                array_pop_free(article->stack, word_core);
            }

            /* End-of-sentence punctuation closes the sentence and
               returns control to the outer loop. */
            if(end_of_line(lang, word)) {
                sentence->end = word + strlen(word);
                article->num_words += sentence->num_words;
                break;
            }
        }
    }

    PROF_END("article parsing");

    /*fprintf(stdout, "Number of sentences - %lu\n", ARR_SZ(article->sentences));
    fprintf(stdout, "Number of words - %lu\n", ARR_SZ(article->words));*/

    return(SMRZR_OK);
}