/*
 * Split every document of a corpus and write both resulting corpora to disk.
 *
 * dataset   - path handed to read_data() to load the source corpus
 * src_data  - path the source corpus is written to after splitting
 * dest_data - path the newly built destination corpus is written to
 * prop      - proportion forwarded unchanged to split() for every document
 *
 * A destination corpus with the same nterms/ndocs as the source is allocated,
 * split() is called once per document with the source doc, the matching
 * destination doc and prop, and then both corpora are written with
 * write_corpus(). How a document is divided is decided entirely by split().
 *
 * On a read or allocation failure a message is printed to stderr and the
 * function returns without writing anything.
 */
void within_doc_split(char* dataset, char* src_data, char* dest_data, double prop)
{
    int i;
    corpus* corp;
    corpus* dest_corp;

    corp = read_data(dataset);
    if (corp == NULL)
    {
        fprintf(stderr, "within_doc_split: could not read corpus from %s\n", dataset);
        return;
    }

    dest_corp = malloc(sizeof(corpus));
    if (dest_corp == NULL)
    {
        fprintf(stderr, "within_doc_split: out of memory allocating corpus\n");
        return;
    }

    printf("splitting %d docs\n", corp->ndocs);

    dest_corp->docs = malloc(sizeof(doc) * corp->ndocs);
    /* malloc(0) may legitimately return NULL, so only a non-empty request counts as failure. */
    if (dest_corp->docs == NULL && corp->ndocs > 0)
    {
        fprintf(stderr, "within_doc_split: out of memory allocating %d docs\n", corp->ndocs);
        free(dest_corp);
        return;
    }
    dest_corp->nterms = corp->nterms;
    dest_corp->ndocs = corp->ndocs;

    for (i = 0; i < corp->ndocs; i++)
    {
        split(&(corp->docs[i]), &(dest_corp->docs[i]), prop);
    }

    write_corpus(dest_corp, dest_data);
    write_corpus(corp, src_data);

    /* NOTE(review): neither corpus is released here; no corpus cleanup helper
     * is visible from this function, so ownership is left as it was. */
}
/*
 * Write a corpus sequence to two files derived from `name`.
 *
 * c    - the sequence; c->len slices, each c->corpus[n] carrying an ndocs count
 * name - path prefix for the output files
 *
 * "<name>-seq.dat" receives c->len followed by the ndocs of every slice,
 * space separated. "<name>-mult.dat" receives the result of
 * collapse_corpus_seq(c), written with write_corpus().
 *
 * File names are built in a fixed-size buffer; if `name` is too long to fit,
 * or the sequence file cannot be opened, the problem is logged through
 * outlog() and the function returns early instead of overrunning the buffer
 * or writing through a NULL FILE pointer.
 */
void write_corpus_seq(corpus_seq_t* c, char* name)
{
    char tmp_string[400];
    int n;
    int needed;
    FILE* seq_file;
    corpus_t* flat;

    outlog("writing %d slices to %s (%d total docs)", c->len, name, c->ndocs);

    /* snprintf returns the length it wanted to write; >= buffer size means truncation. */
    needed = snprintf(tmp_string, sizeof(tmp_string), "%s-seq.dat", name);
    if (needed < 0 || (size_t) needed >= sizeof(tmp_string))
    {
        outlog("output name %s is too long; nothing written", name);
        return;
    }

    seq_file = fopen(tmp_string, "w");
    if (seq_file == NULL)
    {
        outlog("could not open %s for writing", tmp_string);
        return;
    }

    fprintf(seq_file, "%d", c->len);
    for (n = 0; n < c->len; n++)
    {
        fprintf(seq_file, " %d", c->corpus[n]->ndocs);
    }
    fclose(seq_file);

    /* The "-mult.dat" suffix is one character longer, so it is checked separately. */
    needed = snprintf(tmp_string, sizeof(tmp_string), "%s-mult.dat", name);
    if (needed < 0 || (size_t) needed >= sizeof(tmp_string))
    {
        outlog("output name %s is too long; flattened corpus not written", name);
        return;
    }

    flat = collapse_corpus_seq(c);
    write_corpus(flat, tmp_string);
}
double Process::nn_dev_test(string to_test,string output,string gold) { time_t now; //also assuming test-file itself is gold file(this must be true with dev file) dev_test_corpus = read_corpus(to_test); each_get_featgen(1); /*************virtual****************/ int token_num = 0; //token number int miss_count = 0; time(&now); cout << "#--Test at " << ctime(&now) << std::flush; for(int i=0;i<dev_test_corpus->size();i++){ DependencyInstance* t = dev_test_corpus->at(i); int length = t->forms->size(); token_num += length - 1; vector<int>* ret = each_test_one(t); /*************virtual****************/ for(int i2=1;i2<length;i2++){ //ignore root if((*ret)[i2] != (*(t->heads))[i2]) miss_count ++; } delete t->heads; t->heads = ret; } time(&now); cout << "#--Finish at " << ctime(&now) << std::flush; write_corpus(dev_test_corpus,output); string ttt; double rate = (double)(token_num-miss_count) / token_num; cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num << "(" << rate << ")" << endl; DependencyEvaluator::evaluate(gold,output,ttt,false); //clear for(int i=0;i<dev_test_corpus->size();i++){ delete dev_test_corpus->at(i); } delete dev_test_corpus; return rate; }
double Process::nn_dev_test(string to_test,string output,string gold,int dev) { time_t now; //also assuming test-file itself is gold file(this must be true with dev file) dev_test_corpus = read_corpus(to_test); dict->prepare_corpus(dev_test_corpus,1); //get those indexes int token_num = 0; //token number int miss_count = 0; time(&now); cout << "#--Test at " << ctime(&now) << std::flush; for(unsigned int i=0;i<dev_test_corpus->size();i++){ DependencyInstance* t = dev_test_corpus->at(i); int length = t->forms->size(); token_num += length - 1; each_test_one(t,dev); /*************virtual****************/ for(int i2=1;i2<length;i2++){ //ignore root if((*(t->predict_heads))[i2] != (*(t->heads))[i2]) miss_count ++; } } time(&now); cout << "#--Finish at " << ctime(&now) << std::flush; dict->prepare_deprel_str(dev_test_corpus); //get deprel's strings write_corpus(dev_test_corpus,output); string ttt; double rate = (double)(token_num-miss_count) / token_num; cout << "Evaluate:" << (token_num-miss_count) << "/" << token_num << "(" << rate << ")" << endl; DependencyEvaluator::evaluate(gold,output,ttt,hp->CONF_labeled); //clear for(unsigned int i=0;i<dev_test_corpus->size();i++){ delete dev_test_corpus->at(i); } delete dev_test_corpus; return rate; }