Example No. 1
template <typename F>
F foreach_well(const dataset& data, F fn, std::string id_field)
{
    const auto& id = data.at(id_field);

    std::size_t begin_rec = 0, end_rec = 0;
    for (std::size_t i = 0; i < id.size(); ++i) {
        if (id[i] != id[begin_rec] || i == id.size() - 1) {
            if (i == id.size() - 1)
                end_rec = i;

            dataset well;
            std::for_each(data.begin(), data.end(),
                    [&](const std::pair<std::string,
                        std::vector<std::string>>& column)
                    {
                        well[column.first] = std::vector<std::string>(
                            column.second.data() + begin_rec,
                            column.second.data() + end_rec + 1);
                    }
            );
            fn(well);

            begin_rec = i;
        }
        end_rec = i;
    }

    return fn;
}
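A minimal usage sketch (not part of the original example): it assumes dataset is a std::map<std::string, std::vector<std::string>> keyed by column name, which is what the std::for_each body above implies; the well data below is illustrative.

#include <iostream>
#include <map>
#include <string>
#include <vector>

using dataset = std::map<std::string, std::vector<std::string>>;

// foreach_well as defined in Example No. 1 above...

int main()
{
    dataset data{
        {"WELL", {"W1", "W1", "W2", "W2", "W2"}},
        {"DEPT", {"100", "200", "150", "250", "350"}}
    };
    // the callback receives one single-well dataset per distinct WELL value
    foreach_well(data, [](const dataset& well) {
        std::cout << well.at("WELL").front() << ": "
                  << well.at("WELL").size() << " records\n";
    }, "WELL");
    return 0;
}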
Example No. 2
int model::load_training_data(const dataset &ds)
{
	int nrow, ncol;

	nrow = ds.ins_num();
	ncol = ds.fea_num();

	if (nrow <= 0 || ncol < 1) {
		ULIB_FATAL("invalid training data dimensions");
		return -1;
	}
	if (nrow > FLAGS_max_num_examples)
		nrow = FLAGS_max_num_examples;

	if (alloc_training_data(nrow, ncol)) {
		ULIB_FATAL("couldn't allocate training data");
		return -1;
	}

	double tavg = 0;
	double tvar = 0;
	for (int i = 0; i < nrow; ++i) {
		double t = ds.get_tgv(i);
		tavg += t;
		tvar += t*t;
		gsl_vector_set(_tv, i, t);
		for (int j = 0; j < ncol; ++j)
			gsl_matrix_set(_fm, i, j, ds.get_fea(i, j));
	}
	_t_avg = tavg/nrow;
	_t_std = sqrt(tvar/nrow - _t_avg*_t_avg);

	return 0;
}
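Note: the loop accumulates Σt and Σt² in a single pass, so _t_avg = Σt/nrow is the mean of the targets and _t_std = sqrt(Σt²/nrow - _t_avg²) is the population standard deviation. This one-pass formula is compact, but it can lose precision to cancellation when the variance is small relative to the mean.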
Example No. 3
template <class ControlModel, class ObservationModel>
void slam::slam_data<ControlModel, ObservationModel>
::add_dataset (const dataset<ControlModel, ObservationModel>& data,
               const typename ControlModel::builder& control_model_builder,
               const typename ObservationModel::builder& obs_model_builder) {
    
    using namespace boost::adaptors;

    auto add_observations = [&](timestep_type t) {
        for (const auto& obs : values(data.observations_at(t))) {
            add_observation (obs.id, obs_model_builder(obs.observation));
        }
    };
    
    add_observations (current_timestep());
    timestep (current_timestep());
    
    while (current_timestep() < data.current_timestep()) {
        add_control (control_model_builder (data.control(current_timestep()),
                                            data.timedelta(current_timestep())));
        add_observations (current_timestep());
        timestep (current_timestep());
    }
    
    completed();
}
Example No. 4
void silhouette_ksearch::process(const dataset & p_data, silhouette_ksearch_data & p_result) {
    if (m_kmax > p_data.size()) {
        throw std::invalid_argument("K max value '" + std::to_string(m_kmax) + 
            "' should be bigger than amount of objects '" + std::to_string(p_data.size()) + "' in input data.");
    }

    p_result.scores().reserve(m_kmax - m_kmin);

    for (std::size_t k = m_kmin; k < m_kmax; k++) {
        cluster_sequence clusters;
        m_allocator->allocate(k, p_data, clusters);

        if (clusters.size() != k) {
            p_result.scores().push_back(std::nan("1"));
            continue;
        }
        
        silhouette_data result;
        silhouette().process(p_data, clusters, result);

        const double score = std::accumulate(result.get_score().begin(), result.get_score().end(), (double) 0.0) / result.get_score().size();
        p_result.scores().push_back(score);

        if (score > p_result.get_score()) {
            p_result.set_amount(k);
            p_result.set_score(score);
        }
    }
}
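Note: the silhouette of a point i is s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is its mean distance to the other members of its own cluster and b(i) is its mean distance to the members of the nearest other cluster, so s(i) lies in [-1, 1]. The loop above scores each candidate k by the mean silhouette over all points (NaN when the allocator fails to produce exactly k clusters) and keeps the k with the highest score.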
Example No. 5
// Generate a result set from two sets of datapoints: the first set contains all
// datapoints that have other datapoints within their buffer zone, and the second
// set contains all datapoints without other datapoints in their buffer zone.
// Standalone points are kept as-is; points from the first set are picked at
// random, and every not-yet-picked point lying in a picked point's buffer zone
// is discarded.
dataset generateSet(dataset& withNearbyDataset, dataset& standaloneDataset) {
    random_device rd;
    mt19937 rng(rd());
    
    dataset remainingDataset(withNearbyDataset.begin(), withNearbyDataset.end());
    dataset resultSet(standaloneDataset.begin(), standaloneDataset.end());
    
    while (remainingDataset.size() != 0) {
        // create iterator
        dataset::iterator it = remainingDataset.begin();
        
        // generate a random index in [0, size - 1]
        uniform_int_distribution<int> uni(0, (int)remainingDataset.size() - 1);
        int r = uni(rng);
        // pick a random datapoint by advancing the iterator to the random position
        advance(it, r);
        
        
        // add picked datapoint to result list
        resultSet.insert(*it);
        
        // remove all datapoints within buffer zone if still in remaining dataset
        for (dataset::iterator j = it->buffer.begin(); j != it->buffer.end(); ++j) {
            dataset::iterator tmp = remainingDataset.find(*j);
            if (tmp != remainingDataset.end()) {
                remainingDataset.erase(tmp);
            }
        }
        
        // remove picked datapoint from remaining list
        remainingDataset.erase(remainingDataset.find(*it));
    }
    
    return resultSet;
}
Example No. 6
void experiment_datasets::set_train_test_pairs(const dataset & train, const dataset & test, int pair_num)
{
	shared_ptr<dataset> p_test_set(test.clone());

	shared_ptr<dataset> p_train_set(train.clone());

	train_test_pairs.clear();

	for (int i = 0; i < pair_num; i++)
	{
		train_test_pairs.push_back(train_test_pair(p_train_set,p_test_set));
	}

}
Example No. 7
void kmeans::update_clusters(const dataset & p_centers, cluster_sequence & p_clusters) {
    const dataset & data = *m_ptr_data;

    p_clusters.clear();
    p_clusters.resize(p_centers.size());

    /* fill clusters again in line with centers. */
    if (m_ptr_indexes->empty()) {
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for(std::size_t(0), data.size(), [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point = 0; index_point < winners.size(); index_point++) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }
    else {
        /* This part of the code is shared with X-Means, where a parallel implementation
           slightly reduces performance; experiments were performed with both our own
           implementation and the Intel TBB library. In the K-Means-only case, however,
           it works perfectly and increases performance. */
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for_each(*m_ptr_indexes, [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point : *m_ptr_indexes) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }

    erase_empty_clusters(p_clusters);
}
Example No. 8
vector<vector<int>> random_shuffer_dataset_splitter::split_impl(const dataset& data) const
{
	vector<vector<int>> batch_ids(batch_num);

	int sample_num = data.get_sample_num();
	vector<int> temp(sample_num);
	for (int i = 0;i<sample_num;i++)
		temp[i] = i;

	std::random_shuffle ( temp.begin(), temp.end() );


	int batch_size = ceil(float(sample_num)/batch_num);

	for (int i = 0;i<batch_num;i++)
	{
		int cur_batch_size = batch_size;
		if (i == batch_num-1)
			cur_batch_size = sample_num - (batch_num-1)*batch_size;
		vector<int> cur_batch_id(cur_batch_size);

		for (int j = 0;j<cur_batch_size;j++)
			cur_batch_id[j] = temp[i*batch_size + j];

		batch_ids[i] = cur_batch_id;

	}

	return batch_ids;
}
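std::random_shuffle, used here (and again in Example No. 16), was deprecated in C++14 and removed in C++17. A sketch of an equivalent index shuffle using the C++11 facilities; the function name is illustrative:

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

std::vector<int> shuffled_indices(int sample_num)
{
	std::vector<int> temp(sample_num);
	std::iota(temp.begin(), temp.end(), 0);   // fill with 0, 1, ..., sample_num-1
	std::mt19937 rng{std::random_device{}()}; // explicitly seeded engine
	std::shuffle(temp.begin(), temp.end(), rng);
	return temp;
}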
Example No. 9
template <typename T>
inline typename boost::enable_if<is_multi_array<T>, void>::type
write_dataset(dataset& dset, T const& value)
{
    typedef typename T::element value_type;
    hid_t type_id = ctype<value_type>::hid();
    dset.write(type_id, value.origin());
}
Example No. 10
template <typename T>
typename boost::enable_if<is_multi_array<T>, void>::type
read_dataset(dataset & data_set, T & array)
{
    const int array_rank = T::dimensionality;
    typedef typename T::element value_type;

    // --- use temporary dataspace object to get the shape of the dataset
    dataspace file_space(data_set);
    if (!(file_space.rank() == array_rank))
        H5XX_THROW("dataset \"" + get_name(data_set) + "\" and target array have mismatching dimensions");

    boost::array<hsize_t, array_rank> file_dims = file_space.extents<array_rank>();

    // --- clear array - TODO check if this feature is necessary/wanted
    boost::array<size_t, array_rank> array_zero;
    array_zero.assign(0);
    array.resize(array_zero);

    // --- resize array to match the dataset - TODO check if this feature is necessary/wanted
    boost::array<size_t, array_rank> array_shape;
    std::copy(file_dims.begin(), file_dims.begin() + array_rank, array_shape.begin());
    array.resize(array_shape);

    hid_t mem_space_id = H5S_ALL;
    hid_t file_space_id = H5S_ALL;
    hid_t xfer_plist_id = H5P_DEFAULT;

    data_set.read(ctype<value_type>::hid(), array.origin(), mem_space_id, file_space_id, xfer_plist_id);
}
Example No. 11
template <typename T>
inline typename boost::enable_if<is_multi_array<T>, void>::type
write_dataset(dataset& dset, T const& value, dataspace const& memspace, dataspace const& filespace)
{
    typedef typename T::element value_type;
    hid_t type_id = ctype<value_type>::hid();
    hid_t mem_space_id = memspace.hid(); //H5S_ALL;
    hid_t file_space_id = filespace.hid();
    hid_t xfer_plist_id = H5P_DEFAULT;
    dset.write(type_id, value.origin(), mem_space_id, file_space_id, xfer_plist_id);
}
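Note: this overload differs from the one in Example No. 9 only in passing explicit memory and file dataspaces through to HDF5, which enables partial and selected writes; the two-argument overload presumably falls back to H5S_ALL for both, as the read path in Example No. 10 does explicitly.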
Example No. 12
double kmedians::update_medians(cluster_sequence & clusters, dataset & medians) {
    const dataset & data = *m_ptr_data;
    const std::size_t dimension = data[0].size();

    std::vector<point> prev_medians(medians);

    medians.clear();
    medians.resize(clusters.size(), point(dimension, 0.0));

    double maximum_change = 0.0;

    for (std::size_t index_cluster = 0; index_cluster < clusters.size(); index_cluster++) {
        calculate_median(clusters[index_cluster], medians[index_cluster]);

        double change = m_metric(prev_medians[index_cluster], medians[index_cluster]);
        if (change > maximum_change) {
            maximum_change = change;
        }
    }

    return maximum_change;
}
Example No. 13
void kmedians::update_clusters(const dataset & medians, cluster_sequence & clusters) {
    const dataset & data = *m_ptr_data;

    clusters.clear();
    clusters.resize(medians.size());

    for (size_t index_point = 0; index_point < data.size(); index_point++) {
        size_t index_cluster_optim = 0;
        double distance_optim = std::numeric_limits<double>::max();

        for (size_t index_cluster = 0; index_cluster < medians.size(); index_cluster++) {
            double distance = m_metric(data[index_point], medians[index_cluster]);
            if (distance < distance_optim) {
                index_cluster_optim = index_cluster;
                distance_optim = distance;
            }
        }

        clusters[index_cluster_optim].push_back(index_point);
    }

    erase_empty_clusters(clusters);
}
Example No. 14
dataset_group dataset_splitter::split(const dataset & data) const
{
	dataset_group group;

	vector<vector<int>> batch_ids = this->split_impl(data);

	for (int i = 0;i<batch_ids.size();i++)
	{

		group.add_dataset(data.sub_set(batch_ids[i]));
	}

	return group;
}
Example No. 15
void kmeans::assign_point_to_cluster(const std::size_t p_index_point, const dataset & p_centers, std::vector<std::size_t> & p_clusters) {
    double    minimum_distance = std::numeric_limits<double>::max();
    size_t    suitable_index_cluster = 0;

    for (size_t index_cluster = 0; index_cluster < p_centers.size(); index_cluster++) {
        double distance = m_metric(p_centers[index_cluster], (*m_ptr_data)[p_index_point]);

        if (distance < minimum_distance) {
            minimum_distance = distance;
            suitable_index_cluster = index_cluster;
        }
    }

    p_clusters[p_index_point] = suitable_index_cluster;
}
Example No. 16
vector<vector<int>> random_shuffer_ratio_splitter::split_impl(const dataset& data) const
{
	vector<NumericType> percent(ratio);
	
	NumericType total = std::accumulate(ratio.begin(),ratio.end(),NumericType(0));

	BOOST_FOREACH(NumericType & x,percent){ x = x/total; }
//	std::transform(percent.begin(),percent.end(),percent.begin(),[total](NumericType val){return val/total;});

	vector<vector<int>> group_ids(percent.size());

	int sample_num = data.get_sample_num();
	vector<int> temp;

	std::copy(
		boost::counting_iterator<unsigned int>(0),
		boost::counting_iterator<unsigned int>(sample_num), 
		std::back_inserter(temp));

	std::random_shuffle ( temp.begin(), temp.end() );

	vector<int>::iterator cur_begin_iter = temp.begin();
	for (int i = 0;i<percent.size();i++)
	{
		int cur_group_size = floor(sample_num * percent[i]);
		vector<int>::iterator  cur_end_iter = cur_begin_iter + cur_group_size;
		if (i == percent.size()-1)
		{
			cur_end_iter = temp.end();
			cur_group_size = cur_end_iter - cur_begin_iter;
		}
		vector<int> cur_group_id(cur_group_size);

		copy(cur_begin_iter,cur_end_iter, cur_group_id.begin());

		cur_begin_iter = cur_end_iter;

		group_ids[i] = cur_group_id;

	}

	return group_ids;
	
}
Example No. 17
template <typename T>
typename boost::enable_if<is_multi_array<T>, void>::type
read_dataset(dataset & data_set, T & array, dataspace const& memspace, dataspace const& filespace)
{
    // --- disabled this check, it is orthogonal to a useful feature (eg read from 2D dataset into 1D array)
//    const int array_rank = T::dimensionality;
//    if (!(memspace.rank() == array_rank)) {
//        throw error("memory dataspace and array rank do not match");
//    }

    if (static_cast<hsize_t>(filespace.get_select_npoints()) > array.num_elements())
        H5XX_THROW("target array does not provide enough space to store selected dataspace elements");

    hid_t mem_space_id = memspace.hid(); //H5S_ALL;
    hid_t file_space_id = filespace.hid();
    hid_t xfer_plist_id = H5P_DEFAULT;

    typedef typename T::element value_type;
    data_set.read(ctype<value_type>::hid(), array.origin(), mem_space_id, file_space_id, xfer_plist_id);
}
Example No. 18
    /**
     * Constructor which learns a Chow-Liu tree from the given dataset.
     * @param X             Variables over which to learn a tree.
     * @param ds            Dataset to use for computing marginals.
     */
    chow_liu(const forward_range<typename F::variable_type*>& X_,
             const dataset<>& ds, const parameters& params = parameters())
      : params(params) {

      typedef typename F::variable_type variable_type;
      assert(ds.size() > 0);
      std::vector<variable_type*> X(X_.begin(), X_.end());
      if (X.size() == 0)
        return;

      // g will hold weights (mutual information) and factors F for each edge.
      typedef std::pair<double, F> edge_mi_pot;
      typedef undirected_graph<variable_type*, void_, edge_mi_pot> ig_type;
      ig_type g;
      foreach(variable_type* v, X)
        g.add_vertex(v);
      for (size_t i(0); i < X.size() - 1; ++i) {
        for (size_t j(i+1); j < X.size(); ++j) {
          typename F::domain_type
            edge_dom(make_domain<variable_type>(X[i],X[j]));
          F f((params.lambda < 0 ?
               learn_factor<F>::learn_marginal(edge_dom, ds) :
               learn_factor<F>::learn_marginal(edge_dom, ds, params.lambda)));
          double mi(f.mutual_information(make_domain(X[i]), make_domain(X[j])));
          g.add_edge(X[i], X[j], std::make_pair(mi, f));
          if (params.retain_edge_score_mapping) {
            edge_score_mapping_[edge_dom] = mi;
          }
        }
      }

      // Create a MST over the graph g.
      std::vector<F> mst_factors;
      kruskal_minimum_spanning_tree
        (g, transformed_output(back_inserter(mst_factors),
                               impl::mst_edge2f_functor<F>(g)),
         impl::mst_weight_functor<F>(g));

      // Create a decomposable model consisting of the cliques in mst_edges
      model_ *= mst_factors;
    }
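Note: Chow-Liu learning picks the tree T that maximizes the sum of pairwise mutual informations Σ_{(i,j) in T} I(X_i; X_j), which is why each candidate edge above is weighted by the mutual information of its learned pairwise marginal; since kruskal_minimum_spanning_tree computes a minimum spanning tree, impl::mst_weight_functor presumably negates those weights to turn the problem into a maximization.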
Example No. 19
vector<vector<int>> ordered_dataset_splitter::split_impl(const dataset& data) const
{
	vector<vector<int>> batch_ids(batch_num);
	int sample_num = data.get_sample_num();
	int batch_size = ceil(float(sample_num)/batch_num);

	for (int i = 0;i<batch_num;i++)
	{
		int cur_batch_size = batch_size;
		if (i == batch_num-1)
			cur_batch_size = sample_num - (batch_num-1)*batch_size;
		vector<int> cur_batch_id(cur_batch_size);

		for (int j = 0;j<cur_batch_size;j++)
			cur_batch_id[j] = i*batch_size + j;

		batch_ids[i] = cur_batch_id;

	}

	return batch_ids;
}
Example No. 20
void read_data(ifstream & in, dataset & data) {
    string line;
    sequence seq;
    while (getline(in, line)) {
	strtokenizer tok(line, " \t\r\n");
	int len = tok.count_tokens();
	
	if (len <= 0) {
	    if (seq.size() > 0) {
		data.push_back(seq);
	    }
	    
	    seq.clear();
	    continue;
	}
	
	obsr ob;
	for (int i = 0; i < len; i++) {
	    ob.push_back(tok.token(i));
	}
	
	seq.push_back(ob);
    }

    // flush the last sequence if the input does not end with a blank line
    if (seq.size() > 0) {
        data.push_back(seq);
    }
}
Example No. 21
object::object(const dataset& object_) : object_handle_(object_.native_handle())
{
}
Example No. 22
int model::predict(const dataset &tds, gsl_matrix **pp)
{
	int ret = -1;

	gsl_matrix *mat = NULL;
	gsl_matrix *ptv = NULL;
	gsl_matrix *km1 = NULL;
	gsl_matrix *km2 = NULL;
	gsl_matrix *res = NULL;
	gsl_matrix *stm = NULL;

	gsl_vector_view avg_col;
	gsl_vector_view dv;

	if (tds.ins_num() <= 0 || tds.fea_num() != (int)_col_mean->size) {
		ULIB_FATAL("invalid test dimensions, (ins_num=%d,fea_num=%d)",
                           tds.ins_num(), tds.fea_num());
		goto done;
	}

	mat = gsl_matrix_alloc(tds.ins_num(), tds.fea_num());
	if (mat == NULL) {
		ULIB_FATAL("couldn't allocate test feature matrix");
		goto done;
	}
	ptv = gsl_matrix_alloc(tds.ins_num(), 2);
	if (ptv == NULL) {
		ULIB_FATAL("couldn't allocate prediction matrix");
		goto done;
	}
	if (tds.get_matrix(mat)) {
		ULIB_FATAL("couldn't get test matrix");
		goto done;
	}
	dbg_print_mat(mat, "Test Matrix:");

	zero_out_mat(mat);
	norm_mat(mat);

	dbg_print_mat(mat, "Normalized Test Matrix:");

	km1 = comp_kern_mat(mat, _fm, _kern);
	if (km1 == NULL) {
		ULIB_FATAL("couldn't compute test1 kernel matrix");
		goto done;
	}
	dbg_print_mat(km1, "Test Kernel Matrix:");

	km2 = comp_kern_mat(mat, mat, _kern);
	if (km2 == NULL) {
		ULIB_FATAL("couldn't compute test2 kernel matrix");
		goto done;
	}
	dbg_print_mat(km2, "Test2 Kernel Matrix:");
	dv = gsl_matrix_diagonal(km2);

	res = gsl_matrix_alloc(km1->size1, _ikm->size2);
	if (res == NULL) {
		ULIB_FATAL("couldn't allocate temporary matrix");
		goto done;
	}
	stm = gsl_matrix_alloc(km2->size1, km2->size2);
	if (stm == NULL) {
		ULIB_FATAL("couldn't allocate std matrix");
		goto done;
	}

	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, km1, _ikm, 0.0, res);
	gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, res, km1, 0.0, stm);
	gsl_matrix_sub(km2, stm);
	dbg_print_mat(res, "Predictive Matrix:");
	avg_col = gsl_matrix_column(ptv, 0);
	gsl_blas_dgemv(CblasNoTrans, 1.0, res, _tv, 0.0, &avg_col.vector);
	gsl_vector_add_constant(&avg_col.vector, _t_avg);
	gsl_matrix_scale(km2, _t_std*_t_std);
	gsl_vector_add_constant(&dv.vector, _noise_var);
	for (size_t i = 0; i < km2->size1; ++i)
		gsl_matrix_set(ptv, i, 1, sqrt(gsl_vector_get(&dv.vector, i)));
	*pp = ptv;
	ptv = NULL;
	ret = 0;
done:
	gsl_matrix_free(mat);
	gsl_matrix_free(ptv);
	gsl_matrix_free(km1);
	gsl_matrix_free(km2);
	gsl_matrix_free(res);
	gsl_matrix_free(stm);
	return ret;
}
Example No. 23
// call this to compute precision, recall, and F1-measure
void evaluate(dataset & data, string & chunktype, labelset & labels, chunkset & chunks) {
    map<string, int> lbstr2int;
    map<int, string> lbint2str;
    vector<int> human_lb_count, model_lb_count, human_model_lb_count;

    int i;
    int num_labels = labels.size();
    
    for (i = 0; i < num_labels; i++) {
	lbstr2int.insert(pair<string, int>(labels[i], i));
	lbint2str.insert(pair<int, string>(i, labels[i]));
	
	human_lb_count.push_back(0);
	model_lb_count.push_back(0);
	human_model_lb_count.push_back(0);
    }

    // start to count
    dataset::iterator datait;
    sequence::iterator seqit;    
    for (datait = data.begin(); datait != data.end(); datait++) {
	for (seqit = datait->begin(); seqit != datait->end(); seqit++) {
	    int label = str_2_int(lbstr2int, (*seqit)[seqit->size() - 2]);
	    int model_label = str_2_int(lbstr2int, (*seqit)[seqit->size() - 1]);
	    
	    if (label >= 0 && label < num_labels) {
		human_lb_count[label]++;
	    }
	    
	    if (model_label >= 0 && model_label < num_labels) {
		model_lb_count[model_label]++;
	    }
	    
	    if (label == model_label && label >= 0 && label < num_labels) {
		human_model_lb_count[label]++;
	    }
	}
    }
    
    // print out    
    printf("\tLabel-based performance evaluation:\n\n");
    printf("\t\tLabel\tManual\tModel\tMatch\tPre.(%)\tRec.(%)\tF1-Measure(%)\n");
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    int count = 0;
    double precision = 0.0, recall = 0.0, f1, total1_pre = 0.0, 
	   total1_rec = 0.0, total1_f1 = 0.0, total2_pre = 0.0, 
	   total2_rec = 0.0, total2_f1 = 0.0;
    int total_human = 0, total_model = 0, total_match = 0;
    
    for (i = 0; i < num_labels; i++) {
	if (model_lb_count[i] > 0) {
	    precision = (double)human_model_lb_count[i] / model_lb_count[i];
	    total_model += model_lb_count[i];
	    total1_pre += precision;
	} else {
	    precision = 0.0;
	}
	if (human_lb_count[i] > 0) {
	    recall = (double)human_model_lb_count[i] / human_lb_count[i];
	    total_human += human_lb_count[i];
	    total1_rec += recall;
	    count++;
	} else {
	    recall = 0.0;
	}
	
	total_match += human_model_lb_count[i];
	
	if (recall + precision > 0) {
	    f1 = (double) 2 * precision * recall / (precision + recall);
	} else {
	    f1 = 0;
	}
	
	char buff[50];
	sprintf(buff, "%d", i);
	string strlabel = int_2_str(lbint2str, i);
	if (strlabel != "") {	
	    sprintf(buff, "%s", strlabel.c_str());
	}
	
	printf("\t\t%s\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n",
	      buff, human_lb_count[i], model_lb_count[i], human_model_lb_count[i],
	      precision * 100, recall * 100, f1 * 100);	      
    }
    
    if (count > 0) {
	total1_pre /= count;
	total1_rec /= count;
	if (total1_pre + total1_rec > 0) {
	    total1_f1 = 2 * total1_pre * total1_rec / (total1_pre + total1_rec);
	}
    }

    // print the average performance
    if (total_model > 0) {
	total2_pre = (double)total_match / total_model;
    }
    if (total_human > 0) {
	total2_rec = (double)total_match / total_human;
    }
    if (total2_pre + total2_rec > 0) {
	total2_f1 = 2 * total2_pre * total2_rec / (total2_pre + total2_rec);
    }
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");
    printf("\t\tAvg1.\t\t\t\t%6.2f\t%6.2f\t%6.2f\n", total1_pre * 100, total1_rec * 100,
	    total1_f1 * 100);
    printf("\t\tAvg2.\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n\n", total_human, total_model, total_match,
	    total2_pre * 100, total2_rec * 100, total2_f1 * 100);

    if (chunks.size() <= 0) {
	return;
    }

    // chunk based evaluation
    if (chunktype == "IOB1") {
	chunk_evaluate_iob1(data, chunks);
    }
    if (chunktype == "IOB2") {
	chunk_evaluate_iob2(data, chunks);
    }
    if (chunktype == "IOE1") {
	chunk_evaluate_ioe1(data, chunks);
    }
    if (chunktype == "IOE2") {
	chunk_evaluate_ioe2(data, chunks);
    }
}
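Note: Avg1 is the macro-average (per-label precision and recall averaged over the labels that occur in the reference data), while Avg2 is the micro-average computed from pooled counts: precision = total matches / total model predictions and recall = total matches / total reference labels. In both cases F1 = 2*P*R/(P + R), the harmonic mean of precision and recall.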
Example No. 24
double chunk_evaluate_ioe1(dataset & data, chunkset & chunks) {
    vector<int> human_chk_count;
    vector<int> model_chk_count;
    vector<int> match_chk_count;
    int i;

    int num_chunks = chunks.size();
    for (i = 0; i < num_chunks; i++) {
	human_chk_count.push_back(0);
	model_chk_count.push_back(0);
	match_chk_count.push_back(0);
    }

    dataset::iterator datait;
    for (datait = data.begin(); datait != data.end(); datait++) {
	for (i = 0; i < num_chunks; i++) {
	    human_chk_count[i] += count_chunks_ioe1(1, *datait, chunks[i][0], chunks[i][1]);
	    model_chk_count[i] += count_chunks_ioe1(2, *datait, chunks[i][0], chunks[i][1]);
	    match_chk_count[i] += count_matching_chunks_ioe1(*datait, chunks[i][0], chunks[i][1]);
	}
    }

    printf("\tChunk-based performance evaluation:\n\n");
    printf("\t\tChunk\tManual\tModel\tMatch\tPre.(%)\tRec.(%)\tF1-Measure(%)\n");
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    int count = 0;
    double pre = 0.0, rec = 0.0, f1 = 0.0;
    double total1_pre = 0.0, total1_rec = 0.0, total1_f1 = 0.0;
    double total2_pre = 0.0, total2_rec = 0.0, total2_f1 = 0.0;
    int total_human = 0, total_model = 0, total_match = 0;

    for (i = 0; i < num_chunks; i++) {
	if (model_chk_count[i] > 0) {
	    pre = (double)match_chk_count[i] / model_chk_count[i];
	    total_model += model_chk_count[i];
	    total1_pre += pre;
	} else {
	    pre = 0.0;
	}

	if (human_chk_count[i] > 0) {
	    rec = (double)match_chk_count[i] / human_chk_count[i];
	    total_human += human_chk_count[i];
	    total1_rec += rec;
	    count++;
	} else {
	    rec = 0.0;
	}

	total_match += match_chk_count[i];

	if (pre + rec > 0) {
	    f1 = (double) 2 * pre * rec / (pre + rec);
	} else {
	    f1 = 0.0;
	}

	printf("\t\t%s\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n",
	       chunks[i][2].c_str(), human_chk_count[i], model_chk_count[i], 
	       match_chk_count[i], pre * 100, rec * 100, f1 * 100);
    }

    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    if (count > 0) {
	total1_pre /= count;
	total1_rec /= count;
	if (total1_pre + total1_rec > 0) {
	    total1_f1 = 2 * total1_pre * total1_rec / (total1_pre + total1_rec);
	}
	printf("\t\tAvg1.\t\t\t\t%6.2f\t%6.2f\t%6.2f\n",
	       total1_pre * 100, total1_rec * 100, total1_f1 * 100);
    }
    
    if (total_model > 0) {
	total2_pre = (double)total_match / total_model;
    }
    if (total_human > 0) {
	total2_rec = (double)total_match / total_human;
    }
    if (total2_pre + total2_rec > 0) {
	total2_f1 = 2 * total2_rec * total2_pre / (total2_rec + total2_pre);
    }
    
    printf("\t\tAvg2.\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n\n",
	   total_human, total_model, total_match,
	   total2_pre * 100, total2_rec * 100, total2_f1 * 100);

    return total2_f1 * 100;
}
Example No. 25
void attach_dimension_scale(const dimension_scale& dim_scale, const dataset& to,
                            unsigned int dimension)
{
    if (H5DSattach_scale(to.id(), dim_scale.id(), dimension) < 0)
        throw_on_hdf5_error();
}
Example No. 26
void ASSERT_CLUSTER_NOISE_SIZES(
    const dataset & p_data,
    const cluster_sequence & p_actual_clusters,
    const std::vector<std::size_t> & p_expected_cluster_length,
    const noise & p_actual_noise,
    const std::size_t & p_expected_noise_length,
    const index_sequence & p_indexes)
{
    if (p_expected_cluster_length.empty() && p_actual_clusters.empty()) {
        return;
    }

    std::size_t total_size = 0;
    std::unordered_map<std::size_t, bool> unique_objects;
    std::vector<std::size_t> obtained_cluster_length;

    for (auto & cluster : p_actual_clusters) {
        total_size += cluster.size();

        obtained_cluster_length.push_back(cluster.size());

        for (auto index_object : cluster) {
            unique_objects[index_object] = false;
        }
    }

    total_size += p_actual_noise.size();
    for (auto index_object : p_actual_noise) {
        unique_objects[index_object] = false;
    }

    ASSERT_EQ(total_size, unique_objects.size());

    if (!p_expected_cluster_length.empty()) {
        std::size_t expected_total_size = std::accumulate(p_expected_cluster_length.cbegin(), p_expected_cluster_length.cend(), (std::size_t) 0);
        if (p_expected_noise_length != (std::size_t) -1) {
             expected_total_size += p_expected_noise_length;
        }

        ASSERT_EQ(expected_total_size, total_size);

        std::sort(obtained_cluster_length.begin(), obtained_cluster_length.end());

        std::vector<size_t> sorted_expected_cluster_length(p_expected_cluster_length);
        std::sort(sorted_expected_cluster_length.begin(), sorted_expected_cluster_length.end());

        for (size_t i = 0; i < obtained_cluster_length.size(); i++) {
            ASSERT_EQ(obtained_cluster_length[i], sorted_expected_cluster_length[i]);
        }
    }
    else
    {
        if (!p_indexes.empty()) {
            ASSERT_EQ(p_indexes.size(), unique_objects.size());

            for (auto index : p_indexes) {
                ASSERT_TRUE( unique_objects.find(index) != unique_objects.cend() );
            }
        }
        else {
            ASSERT_EQ(p_data.size(), total_size);
        }
    }

    if (p_expected_noise_length != (std::size_t) -1) {
        ASSERT_EQ(p_expected_noise_length, p_actual_noise.size());
    }
}