예제 #1
0
// Build a result set from two sets of datapoints: `withNearbyDataset` holds the
// datapoints that have other datapoints inside their buffer zone, and
// `standaloneDataset` holds the datapoints that have none.
//
// Every standalone datapoint is copied into the result. The datapoints with
// neighbours are then thinned out: one datapoint is picked uniformly at random
// from those still remaining, added to the result, and removed from the pool
// together with every datapoint listed in its `buffer`. This repeats until the
// pool is empty.
//
// The function works on copies; it only reads begin()/end() of both arguments.
// The generator is seeded from random_device, so results differ between calls.
dataset generateSet(dataset& withNearbyDataset, dataset& standaloneDataset) {
    random_device rd;
    mt19937 rng(rd());

    dataset remainingDataset(withNearbyDataset.begin(), withNearbyDataset.end());
    dataset resultSet(standaloneDataset.begin(), standaloneDataset.end());

    while (!remainingDataset.empty()) {
        // uniform_int_distribution bounds are inclusive, so the valid indices
        // are exactly [0, size - 1]. Drawing from [0, size] and folding the
        // result back with a modulo would make the first element twice as
        // likely as any other.
        uniform_int_distribution<dataset::size_type> uni(0, remainingDataset.size() - 1);

        // pick a random datapoint by advancing an iterator to the drawn position
        auto picked = remainingDataset.begin();
        advance(picked, uni(rng));

        // add picked datapoint to result list
        resultSet.insert(*picked);

        // Remove all datapoints within the buffer zone that are still in the
        // pool. The picked datapoint itself is skipped here even if its own
        // buffer lists it: erasing it now would invalidate `picked` while its
        // buffer is still being walked.
        for (const auto& bufferedPoint : picked->buffer) {
            auto neighbour = remainingDataset.find(bufferedPoint);
            if (neighbour != remainingDataset.end() && neighbour != picked) {
                remainingDataset.erase(neighbour);
            }
        }

        // remove picked datapoint from the pool; erasing other elements above
        // left this iterator valid
        remainingDataset.erase(picked);
    }

    return resultSet;
}
예제 #2
0
// Calls `fn` once for every run of consecutive records that share the same
// value in the `id_field` column. Each call receives a freshly built dataset
// that contains, for every column of `data`, only the rows of that run.
//
// A run starts at `begin_rec` and extends while the id equals the id of its
// first record; it ends where the id changes or where the data ends. In
// particular a final record whose id differs from its predecessor forms a run
// of its own. An empty id column results in no calls.
//
// `data.at(id_field)` is used for the lookup, so a missing column is reported
// by whatever `dataset::at` does for an absent key.
// NOTE(review): `F` is not declared in the visible source; it is expected to be
// provided by the surrounding context (e.g. a template header) -- confirm.
//
// Returns `fn` (by value) after the last call.
F foreach_well(const dataset& data, F fn, std::string id_field)
{
    const auto& id = data.at(id_field);
    const std::size_t record_count = id.size();

    std::size_t begin_rec = 0;
    // `i` is the one-past-the-end candidate for the current run, which lets the
    // end of the data close the last run without special-casing the last index.
    for (std::size_t i = 1; i <= record_count; ++i) {
        if (i != record_count && id[i] == id[begin_rec])
            continue;  // still inside the current run

        dataset well;
        // Bind by const auto& so the column is not copied just to be sliced.
        for (const auto& column : data) {
            // NOTE(review): assumes every column has at least as many rows as
            // the id column -- confirm against how datasets are loaded.
            well[column.first] = std::vector<std::string>(
                column.second.data() + begin_rec,
                column.second.data() + i);
        }
        fn(well);

        begin_rec = i;
    }

    return fn;
}
예제 #3
0
// Prints the chunk-based evaluation table built from the ioe1 counting helpers
// and returns the overall F1-measure as a percentage.
//
// For each entry of `chunks`, three counts are accumulated over every element
// of `data`: chunks in the manual annotation, chunks in the model output, and
// chunks that match (count_chunks_ioe1 / count_matching_chunks_ioe1). One table
// row with precision, recall and F1 is then printed per entry.
// NOTE(review): chunks[i][0] and chunks[i][1] are forwarded to the helpers
// unchanged and chunks[i][2] is printed as the row name; the leading 1 / 2
// argument presumably selects manual vs. model labels -- confirm against
// count_chunks_ioe1.
//
// Two summary rows follow the per-chunk rows:
//   Avg1. - summed per-chunk precision / recall divided by the number of
//           entries that have at least one manual chunk (row is printed only
//           when that number is positive);
//   Avg2. - precision / recall computed from the pooled totals.
//
// Returns the Avg2. F1-measure multiplied by 100.
double chunk_evaluate_ioe1(dataset & data, chunkset & chunks) {
    const int num_chunks = chunks.size();
    int i;

    // one zero-initialised counter per chunk entry
    vector<int> human_chk_count(num_chunks, 0);
    vector<int> model_chk_count(num_chunks, 0);
    vector<int> match_chk_count(num_chunks, 0);

    for (dataset::iterator datait = data.begin(); datait != data.end(); ++datait) {
        for (i = 0; i < num_chunks; i++) {
            human_chk_count[i] += count_chunks_ioe1(1, *datait, chunks[i][0], chunks[i][1]);
            model_chk_count[i] += count_chunks_ioe1(2, *datait, chunks[i][0], chunks[i][1]);
            match_chk_count[i] += count_matching_chunks_ioe1(*datait, chunks[i][0], chunks[i][1]);
        }
    }

    printf("\tChunk-based performance evaluation:\n\n");
    // A literal percent sign must be written as "%%"; a bare "%)" is an invalid
    // conversion specification.
    printf("\t\tChunk\tManual\tModel\tMatch\tPre.(%%)\tRec.(%%)\tF1-Measure(%%)\n");
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    int count = 0;
    double pre = 0.0, rec = 0.0, f1 = 0.0;
    double total1_pre = 0.0, total1_rec = 0.0, total1_f1 = 0.0;
    double total2_pre = 0.0, total2_rec = 0.0, total2_f1 = 0.0;
    int total_human = 0, total_model = 0, total_match = 0;

    for (i = 0; i < num_chunks; i++) {
        // precision: matches over model chunks (0 when the model produced none)
        if (model_chk_count[i] > 0) {
            pre = (double)match_chk_count[i] / model_chk_count[i];
            total_model += model_chk_count[i];
            total1_pre += pre;
        } else {
            pre = 0.0;
        }

        // recall: matches over manual chunks; only entries with manual chunks
        // take part in the Avg1. denominator
        if (human_chk_count[i] > 0) {
            rec = (double)match_chk_count[i] / human_chk_count[i];
            total_human += human_chk_count[i];
            total1_rec += rec;
            count++;
        } else {
            rec = 0.0;
        }

        total_match += match_chk_count[i];

        if (pre + rec > 0) {
            f1 = (double) 2 * pre * rec / (pre + rec);
        } else {
            f1 = 0.0;
        }

        printf("\t\t%s\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n",
               chunks[i][2].c_str(), human_chk_count[i], model_chk_count[i],
               match_chk_count[i], pre * 100, rec * 100, f1 * 100);
    }

    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    // Avg1.: average of the per-entry scores
    if (count > 0) {
        total1_pre /= count;
        total1_rec /= count;
        if (total1_pre + total1_rec > 0) {
            total1_f1 = 2 * total1_pre * total1_rec / (total1_pre + total1_rec);
        }
        printf("\t\tAvg1.\t\t\t\t%6.2f\t%6.2f\t%6.2f\n",
               total1_pre * 100, total1_rec * 100, total1_f1 * 100);
    }

    // Avg2.: scores computed from the pooled counts; each stays 0 when its
    // denominator is 0
    if (total_model > 0) {
        total2_pre = (double)total_match / total_model;
    }
    if (total_human > 0) {
        total2_rec = (double)total_match / total_human;
    }
    if (total2_pre + total2_rec > 0) {
        total2_f1 = 2 * total2_rec * total2_pre / (total2_rec + total2_pre);
    }

    printf("\t\tAvg2.\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n\n",
           total_human, total_model, total_match,
           total2_pre * 100, total2_rec * 100, total2_f1 * 100);

    return total2_f1 * 100;
}
예제 #4
0
// Call this to compute and print precision, recall, and F1-measure.
//
// Label-based part: for every token of every sequence in `data`, the
// second-to-last field is looked up as the manual label and the last field as
// the model label (both through str_2_int on a map built from `labels`).
// Lookups that fall outside [0, labels.size()) are not counted, and tokens
// with fewer than two fields are skipped. One table row is printed per label,
// followed by two summary rows:
//   Avg1. - summed per-label precision / recall divided by the number of
//           labels that occur at least once in the manual annotation;
//   Avg2. - precision / recall computed from the pooled totals.
// Any average whose denominator is zero is reported as 0.
//
// Chunk-based part: when `chunks` is non-empty, the evaluator matching
// `chunktype` ("IOB1", "IOB2", "IOE1" or "IOE2") is run afterwards; any other
// value of `chunktype` runs none.
void evaluate(dataset & data, string & chunktype, labelset & labels, chunkset & chunks) {
    const int num_labels = labels.size();
    int i;

    // label string <-> label index lookup tables
    map<string, int> lbstr2int;
    map<int, string> lbint2str;
    for (i = 0; i < num_labels; i++) {
        lbstr2int.insert(pair<string, int>(labels[i], i));
        lbint2str.insert(pair<int, string>(i, labels[i]));
    }

    // one zero-initialised counter per label
    vector<int> human_lb_count(num_labels, 0);
    vector<int> model_lb_count(num_labels, 0);
    vector<int> human_model_lb_count(num_labels, 0);

    // start to count
    for (dataset::iterator datait = data.begin(); datait != data.end(); ++datait) {
        for (sequence::iterator seqit = datait->begin(); seqit != datait->end(); ++seqit) {
            // size() is unsigned: without this guard "size() - 2" would wrap
            // around and index far out of bounds.
            if (seqit->size() < 2) {
                continue;
            }

            int label = str_2_int(lbstr2int, (*seqit)[seqit->size() - 2]);
            int model_label = str_2_int(lbstr2int, (*seqit)[seqit->size() - 1]);

            if (label >= 0 && label < num_labels) {
                human_lb_count[label]++;
            }

            if (model_label >= 0 && model_label < num_labels) {
                model_lb_count[model_label]++;
            }

            if (label == model_label && label >= 0 && label < num_labels) {
                human_model_lb_count[label]++;
            }
        }
    }

    // print out
    printf("\tLabel-based performance evaluation:\n\n");
    // A literal percent sign must be written as "%%"; a bare "%)" is an invalid
    // conversion specification.
    printf("\t\tLabel\tManual\tModel\tMatch\tPre.(%%)\tRec.(%%)\tF1-Measure(%%)\n");
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    int count = 0;
    double precision = 0.0, recall = 0.0, f1 = 0.0;
    double total1_pre = 0.0, total1_rec = 0.0, total1_f1 = 0.0;
    double total2_pre = 0.0, total2_rec = 0.0, total2_f1 = 0.0;
    int total_human = 0, total_model = 0, total_match = 0;

    for (i = 0; i < num_labels; i++) {
        if (model_lb_count[i] > 0) {
            precision = (double)human_model_lb_count[i] / model_lb_count[i];
            total_model += model_lb_count[i];
            total1_pre += precision;
        } else {
            precision = 0.0;
        }

        // only labels present in the manual annotation take part in the
        // Avg1. denominator
        if (human_lb_count[i] > 0) {
            recall = (double)human_model_lb_count[i] / human_lb_count[i];
            total_human += human_lb_count[i];
            total1_rec += recall;
            count++;
        } else {
            recall = 0.0;
        }

        total_match += human_model_lb_count[i];

        if (recall + precision > 0) {
            f1 = (double) 2 * precision * recall / (precision + recall);
        } else {
            f1 = 0;
        }

        // Row name: the label text, or the numeric index when the reverse
        // lookup yields an empty string. The text is kept in a std::string so
        // that an arbitrarily long label cannot overrun a fixed-size buffer.
        string strlabel = int_2_str(lbint2str, i);
        if (strlabel == "") {
            char buff[50];
            snprintf(buff, sizeof(buff), "%d", i);
            strlabel = buff;
        }

        printf("\t\t%s\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n",
              strlabel.c_str(), human_lb_count[i], model_lb_count[i], human_model_lb_count[i],
              precision * 100, recall * 100, f1 * 100);
    }

    // Avg1.: average of the per-label scores. Every division is guarded so an
    // empty or fully unlabelled dataset prints 0.00 instead of nan/inf.
    if (count > 0) {
        total1_pre /= count;
        total1_rec /= count;
    }
    if (total1_pre + total1_rec > 0) {
        total1_f1 = 2 * total1_pre * total1_rec / (total1_pre + total1_rec);
    }

    // Avg2.: scores computed from the pooled counts
    if (total_model > 0) {
        total2_pre = (double)total_match / total_model;
    }
    if (total_human > 0) {
        total2_rec = (double)total_match / total_human;
    }
    if (total2_pre + total2_rec > 0) {
        total2_f1 = 2 * total2_pre * total2_rec / (total2_pre + total2_rec);
    }

    // print the average performance
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");
    printf("\t\tAvg1.\t\t\t\t%6.2f\t%6.2f\t%6.2f\n", total1_pre * 100, total1_rec * 100,
            total1_f1 * 100);
    printf("\t\tAvg2.\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n\n", total_human, total_model, total_match,
            total2_pre * 100, total2_rec * 100, total2_f1 * 100);

    if (chunks.empty()) {
        return;
    }

    // chunk based evaluation; the chunk types are mutually exclusive
    if (chunktype == "IOB1") {
        chunk_evaluate_iob1(data, chunks);
    } else if (chunktype == "IOB2") {
        chunk_evaluate_iob2(data, chunks);
    } else if (chunktype == "IOE1") {
        chunk_evaluate_ioe1(data, chunks);
    } else if (chunktype == "IOE2") {
        chunk_evaluate_ioe2(data, chunks);
    }
}