示例#1
0
文件: estimate.c 项目: Rygbee/ctm-c
/*
 * Split every document of a corpus and write both resulting corpora to disk.
 *
 * dataset   - passed to read_data() to load the source corpus
 * src_data  - path handed to write_corpus() for the source corpus after splitting
 * dest_data - path handed to write_corpus() for the newly built corpus
 * prop      - forwarded unchanged to split() for every document
 *             (presumably the proportion of each document moved; confirm
 *             against split())
 *
 * On allocation failure a message is printed to stderr and the function
 * returns without writing anything.
 */
void within_doc_split(char* dataset, char* src_data, char* dest_data, double prop)
{
    int i;
    corpus * corp, * dest_corp;

    corp = read_data(dataset);
    dest_corp = malloc(sizeof(corpus));
    if (dest_corp == NULL)
    {
        fprintf(stderr, "within_doc_split: out of memory\n");
        return;
    }
    printf("splitting %d docs\n", corp->ndocs);
    dest_corp->docs = malloc(sizeof(doc) * corp->ndocs);
    /* malloc may legitimately return NULL for a zero-sized request, so only
       treat NULL as a failure when documents actually need storage. */
    if (dest_corp->docs == NULL && corp->ndocs > 0)
    {
        fprintf(stderr, "within_doc_split: out of memory\n");
        free(dest_corp);
        return;
    }
    /* The destination corpus mirrors the dimensions of the source corpus. */
    dest_corp->nterms = corp->nterms;
    dest_corp->ndocs = corp->ndocs;
    for (i = 0; i < corp->ndocs; i++)
        split(&(corp->docs[i]), &(dest_corp->docs[i]), prop);
    write_corpus(dest_corp, dest_data);
    write_corpus(corp, src_data);

    /* Release only what this function allocated itself.
       NOTE(review): any buffers split() attaches to the destination docs, and
       the corpus returned by read_data(), are not released here because their
       ownership is not visible from this function - confirm. */
    free(dest_corp->docs);
    free(dest_corp);
}
示例#2
0
文件: data.c 项目: Arttii/dtm
/*
 * Write a corpus sequence to two files whose names are derived from `name`:
 *
 *   <name>-seq.dat   the number of slices (c->len) followed by the ndocs
 *                    value of each slice, space separated
 *   <name>-mult.dat  the corpus returned by collapse_corpus_seq(c), written
 *                    with write_corpus()
 *
 * If either derived path does not fit the path buffer, or the -seq.dat file
 * cannot be opened, an error is logged and nothing further is written.
 */
void write_corpus_seq(corpus_seq_t* c, char* name)
{
    char seq_path[400];
    char mult_path[400];
    int seq_len, mult_len;
    int n;

    outlog("writing %d slices to %s (%d total docs)", c->len, name, c->ndocs);

    /* Build both paths before touching the disk so that an over-long name is
       rejected up front instead of overflowing the buffer or leaving a
       half-written pair of files. */
    seq_len = snprintf(seq_path, sizeof(seq_path), "%s-seq.dat", name);
    mult_len = snprintf(mult_path, sizeof(mult_path), "%s-mult.dat", name);
    if (seq_len < 0 || (size_t) seq_len >= sizeof(seq_path) ||
        mult_len < 0 || (size_t) mult_len >= sizeof(mult_path))
    {
        outlog("cannot write corpus sequence: output path for %s is too long", name);
        return;
    }

    FILE* seq_file = fopen(seq_path, "w");
    if (seq_file == NULL)
    {
        outlog("cannot open %s for writing", seq_path);
        return;
    }
    fprintf(seq_file, "%d", c->len);
    for (n = 0; n < c->len; n++)
    {
        fprintf(seq_file, " %d", c->corpus[n]->ndocs);
    }
    fclose(seq_file);

    /* NOTE(review): the corpus returned by collapse_corpus_seq() is not
       released here; whether the caller of that function owns it is not
       visible from this block - confirm. */
    corpus_t* flat = collapse_corpus_seq(c);
    write_corpus(flat, mult_path);
}
示例#3
0
// Parses the corpus read from `to_test`, writes the result to `output`, and
// reports how many non-root tokens received the same head as in the input.
//
// to_test - path passed to read_corpus(); the heads stored in it are used as
//           the reference when counting misses
// output  - path passed to write_corpus() once every instance has been parsed
// gold    - path passed to DependencyEvaluator::evaluate() together with `output`
//
// Returns (tokens - misses) / tokens, or 0.0 when no tokens were counted.
double Process::nn_dev_test(string to_test,string output,string gold)
{
	time_t now;
	//also assuming test-file itself is gold file(this must be true with dev file)
	dev_test_corpus = read_corpus(to_test);
	each_get_featgen(1);	/*************virtual****************/
	int token_num = 0;	//token number
	int miss_count = 0;
	time(&now);
	cout << "#--Test at " << ctime(&now) << std::flush;
	for(unsigned int i=0;i<dev_test_corpus->size();i++){
		DependencyInstance* t = dev_test_corpus->at(i);
		int length = t->forms->size();
		token_num += length - 1;
		vector<int>* ret = each_test_one(t);		/*************virtual****************/
		for(int i2=1;i2<length;i2++){	//ignore root
			if((*ret)[i2] != (*(t->heads))[i2])
				miss_count ++;
		}
		//the instance now carries the returned heads, so that is what gets written below
		delete t->heads;
		t->heads = ret;
	}
	time(&now);
	cout << "#--Finish at " << ctime(&now) << std::flush;
	write_corpus(dev_test_corpus,output);
	string ttt;
	//guard the division: with no counted tokens 0/0 would yield NaN
	double rate = 0.0;
	if(token_num != 0)
		rate = (double)(token_num-miss_count) / token_num;
	cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num
			<< "(" << rate << ")" << endl;
	DependencyEvaluator::evaluate(gold,output,ttt,false);

	//clear
	for(unsigned int i=0;i<dev_test_corpus->size();i++){
		delete dev_test_corpus->at(i);
	}
	delete dev_test_corpus;
	dev_test_corpus = 0;	//do not leave a dangling pointer behind
	return rate;
}
示例#4
0
// Parses the corpus read from `to_test`, writes the result to `output`, and
// reports how many non-root tokens have a predicted head equal to the head
// stored in the input.
//
// to_test - path passed to read_corpus(); its heads are the reference when
//           counting misses
// output  - path passed to write_corpus() once every instance has been parsed
// gold    - path passed to DependencyEvaluator::evaluate() together with `output`
// dev     - forwarded unchanged to each_test_one() for every instance
//
// Returns (tokens - misses) / tokens, or 0.0 when no tokens were counted.
double Process::nn_dev_test(string to_test,string output,string gold,int dev)
{
	time_t now;
	//also assuming test-file itself is gold file(this must be true with dev file)
	dev_test_corpus = read_corpus(to_test);
	dict->prepare_corpus(dev_test_corpus,1);	//get those indexes
	int token_num = 0;	//token number
	int miss_count = 0;
	time(&now);
	cout << "#--Test at " << ctime(&now) << std::flush;
	for(unsigned int i=0;i<dev_test_corpus->size();i++){
		DependencyInstance* t = dev_test_corpus->at(i);
		int length = t->forms->size();
		token_num += length - 1;
		each_test_one(t,dev);		/*************virtual****************/
		for(int i2=1;i2<length;i2++){	//ignore root
			if((*(t->predict_heads))[i2] != (*(t->heads))[i2])
				miss_count ++;
		}
	}
	time(&now);
	cout << "#--Finish at " << ctime(&now) << std::flush;
	dict->prepare_deprel_str(dev_test_corpus);	//get deprel's strings
	write_corpus(dev_test_corpus,output);
	string ttt;
	//guard the division: with no counted tokens 0/0 would yield NaN
	double rate = 0.0;
	if(token_num != 0)
		rate = (double)(token_num-miss_count) / token_num;
	cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num
			<< "(" << rate << ")" << endl;
	DependencyEvaluator::evaluate(gold,output,ttt,hp->CONF_labeled);

	//clear
	for(unsigned int i=0;i<dev_test_corpus->size();i++){
		delete dev_test_corpus->at(i);
	}
	delete dev_test_corpus;
	dev_test_corpus = 0;	//do not leave a dangling pointer behind
	return rate;
}