template <typename F>
F foreach_well(const dataset& data, F fn, std::string id_field) {
    const auto& id = data.at(id_field);
    std::size_t begin_rec = 0, end_rec = 0;
    for (std::size_t i = 0; i < id.size(); ++i) {
        // A new well starts where the id changes; the final record closes the last well.
        // Note: if the very last record starts a new well, it is emitted together with the
        // previous well's slice rather than as a separate well.
        if (id[i] != id[begin_rec] || i == id.size() - 1) {
            if (i == id.size() - 1)
                end_rec = i;
            dataset well;
            // Copy the [begin_rec, end_rec] slice of every column into the per-well dataset.
            std::for_each(data.begin(), data.end(),
                [&](const std::pair<std::string, std::vector<std::string>>& column) {
                    well[column.first] = std::vector<std::string>(
                        column.second.data() + begin_rec,
                        column.second.data() + end_rec + 1);
                });
            fn(well);
            begin_rec = i;
        }
        end_rec = i;
    }
    return fn;
}
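// A minimal usage sketch for foreach_well. It assumes `dataset` is an alias for a
// string-keyed map of equally sized string columns, visible where foreach_well is
// defined; that assumption matches the interface the function uses (at(), iteration
// over (name, column) pairs, operator[]). The column names and values are made up.
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using dataset = std::map<std::string, std::vector<std::string>>;

int main() {
    dataset data{
        {"well_id", {"W1", "W1", "W2", "W2", "W2"}},
        {"depth",   {"100", "110", "95", "105", "115"}}
    };
    foreach_well(data, [](const dataset& well) {
        std::cout << well.at("well_id").front() << ": "
                  << well.at("depth").size() << " records\n";
    }, "well_id");
    // prints "W1: 2 records" then "W2: 3 records"
}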
int model::load_training_data(const dataset &ds) {
    int nrow = ds.ins_num();
    int ncol = ds.fea_num();
    if (nrow <= 0 || ncol < 1) {
        ULIB_FATAL("invalid training data dimensions");
        return -1;
    }
    if (nrow > FLAGS_max_num_examples)
        nrow = FLAGS_max_num_examples;
    if (alloc_training_data(nrow, ncol)) {
        ULIB_FATAL("couldn't allocate training data");
        return -1;
    }
    double tavg = 0;
    double tvar = 0;
    for (int i = 0; i < nrow; ++i) {
        double t = ds.get_tgv(i);
        tavg += t;
        tvar += t*t;
        gsl_vector_set(_tv, i, t);
        for (int j = 0; j < ncol; ++j)
            gsl_matrix_set(_fm, i, j, ds.get_fea(i, j));
    }
    _t_avg = tavg/nrow;
    _t_std = sqrt(tvar/nrow - _t_avg*_t_avg);
    return 0;
}
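// Self-contained sketch of the single-pass mean/std computation used above,
// i.e. std = sqrt(E[t^2] - E[t]^2). The sample values are made up for illustration.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<double> targets{1.0, 2.0, 4.0, 7.0};
    double sum = 0.0, sum_sq = 0.0;
    for (double t : targets) {
        sum += t;
        sum_sq += t * t;
    }
    double avg = sum / targets.size();
    double stddev = std::sqrt(sum_sq / targets.size() - avg * avg);
    std::printf("avg=%g std=%g\n", avg, stddev);  // avg=3.5, std is roughly 2.29
}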
template <class ControlModel, class ObservationModel>
void slam::slam_data<ControlModel, ObservationModel>
::add_dataset (const dataset<ControlModel, ObservationModel>& data,
               const typename ControlModel::builder& control_model_builder,
               const typename ObservationModel::builder& obs_model_builder) {

    using namespace boost::adaptors;

    auto add_observations = [&](timestep_type t) {
        for (const auto& obs : values(data.observations_at(t))) {
            add_observation (obs.id, obs_model_builder(obs.observation));
        }
    };

    add_observations (current_timestep());
    timestep (current_timestep());

    while (current_timestep() < data.current_timestep()) {
        add_control (control_model_builder (data.control(current_timestep()),
                                            data.timedelta(current_timestep())));
        add_observations (current_timestep());
        timestep (current_timestep());
    }

    completed();
}
void silhouette_ksearch::process(const dataset & p_data, silhouette_ksearch_data & p_result) {
    if (m_kmax > p_data.size()) {
        throw std::invalid_argument("K max value '" + std::to_string(m_kmax) +
            "' should not be greater than the amount of objects '" +
            std::to_string(p_data.size()) + "' in the input data.");
    }

    p_result.scores().reserve(m_kmax - m_kmin);

    for (std::size_t k = m_kmin; k < m_kmax; k++) {
        cluster_sequence clusters;
        m_allocator->allocate(k, p_data, clusters);

        if (clusters.size() != k) {
            p_result.scores().push_back(std::nan("1"));
            continue;
        }

        silhouette_data result;
        silhouette().process(p_data, clusters, result);

        // Average silhouette score over all objects for this value of k.
        const double score = std::accumulate(result.get_score().begin(), result.get_score().end(), 0.0)
            / result.get_score().size();
        p_result.scores().push_back(score);

        if (score > p_result.get_score()) {
            p_result.set_amount(k);
            p_result.set_score(score);
        }
    }
}
// Generate a result set from two sets of datapoints: the first set contains all datapoints
// that have other datapoints in their buffer zone, and the second set contains all
// datapoints without other datapoints in their buffer zone.
dataset generateSet(dataset& withNearbyDataset, dataset& standaloneDataset) {
    random_device rd;
    mt19937 rng(rd());

    dataset remainingDataset(withNearbyDataset.begin(), withNearbyDataset.end());
    dataset resultSet(standaloneDataset.begin(), standaloneDataset.end());

    while (remainingDataset.size() != 0) {
        // create iterator
        dataset::iterator it = remainingDataset.begin();

        // generate a random index in [0, size - 1]; the original inclusive upper bound
        // of size() made index 0 twice as likely after the modulo
        uniform_int_distribution<int> uni(0, (int)remainingDataset.size() - 1);
        int r = uni(rng);

        // pick a random datapoint by advancing the iterator to the random position
        advance(it, r);

        // add the picked datapoint to the result set
        resultSet.insert(*it);

        // remove all datapoints within the buffer zone that are still in the remaining dataset
        // (assumes a datapoint is not contained in its own buffer, otherwise `it` would be
        // invalidated here)
        for (dataset::iterator j = it->buffer.begin(); j != it->buffer.end(); ++j) {
            dataset::iterator tmp = remainingDataset.find(*j);
            if (tmp != remainingDataset.end()) {
                remainingDataset.erase(tmp);
            }
        }

        // remove the picked datapoint from the remaining dataset
        remainingDataset.erase(remainingDataset.find(*it));
    }
    return resultSet;
}
void experiment_datasets::set_train_test_pairs(const dataset & train, const dataset & test, int pair_num) {
    shared_ptr<dataset> p_test_set(test.clone());
    shared_ptr<dataset> p_train_set(train.clone());

    train_test_pairs.clear();
    // Every pair shares the same cloned train and test sets.
    for (int i = 0; i < pair_num; i++) {
        train_test_pairs.push_back(train_test_pair(p_train_set, p_test_set));
    }
}
void kmeans::update_clusters(const dataset & p_centers, cluster_sequence & p_clusters) {
    const dataset & data = *m_ptr_data;

    p_clusters.clear();
    p_clusters.resize(p_centers.size());

    /* fill clusters again in line with centers. */
    if (m_ptr_indexes->empty()) {
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for(std::size_t(0), data.size(), [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point = 0; index_point < winners.size(); index_point++) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }
    else {
        /* This branch is used by X-Means; parallelizing it slightly reduces X-Means
           performance (measured with both our implementation and the Intel TBB library).
           For plain K-Means, however, the parallel version works well and improves
           performance. */
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for_each(*m_ptr_indexes, [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point : *m_ptr_indexes) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }

    erase_empty_clusters(p_clusters);
}
vector<vector<int>> random_shuffer_dataset_splitter::split_impl(const dataset& data) const {
    vector<vector<int>> batch_ids(batch_num);
    int sample_num = data.get_sample_num();

    vector<int> temp(sample_num);
    for (int i = 0; i < sample_num; i++)
        temp[i] = i;
    std::random_shuffle(temp.begin(), temp.end());

    int batch_size = ceil(float(sample_num)/batch_num);
    for (int i = 0; i < batch_num; i++) {
        int cur_batch_size = batch_size;
        if (i == batch_num - 1)
            cur_batch_size = sample_num - (batch_num - 1)*batch_size;

        vector<int> cur_batch_id(cur_batch_size);
        for (int j = 0; j < cur_batch_size; j++)
            cur_batch_id[j] = temp[i*batch_size + j];
        batch_ids[i] = cur_batch_id;
    }
    return batch_ids;
}
template <typename T>
inline typename boost::enable_if<is_multi_array<T>, void>::type
write_dataset(dataset& dset, T const& value) {
    typedef typename T::element value_type;
    hid_t type_id = ctype<value_type>::hid();
    dset.write(type_id, value.origin());
}
template <typename T>
typename boost::enable_if<is_multi_array<T>, void>::type
read_dataset(dataset & data_set, T & array) {
    const int array_rank = T::dimensionality;
    typedef typename T::element value_type;

    // --- use temporary dataspace object to get the shape of the dataset
    dataspace file_space(data_set);
    if (!(file_space.rank() == array_rank))
        H5XX_THROW("dataset \"" + get_name(data_set) + "\" and target array have mismatching dimensions");
    boost::array<hsize_t, array_rank> file_dims = file_space.extents<array_rank>();

    // --- clear array - TODO check if this feature is necessary/wanted
    boost::array<size_t, array_rank> array_zero;
    array_zero.assign(0);
    array.resize(array_zero);

    // --- resize array to match the dataset - TODO check if this feature is necessary/wanted
    boost::array<size_t, array_rank> array_shape;
    std::copy(file_dims.begin(), file_dims.begin() + array_rank, array_shape.begin());
    array.resize(array_shape);

    hid_t mem_space_id = H5S_ALL;
    hid_t file_space_id = H5S_ALL;
    hid_t xfer_plist_id = H5P_DEFAULT;
    data_set.read(ctype<value_type>::hid(), array.origin(), mem_space_id, file_space_id, xfer_plist_id);
}
template <typename T>
inline typename boost::enable_if<is_multi_array<T>, void>::type
write_dataset(dataset& dset, T const& value, dataspace const& memspace, dataspace const& filespace) {
    typedef typename T::element value_type;
    hid_t type_id = ctype<value_type>::hid();
    hid_t mem_space_id = memspace.hid();   // H5S_ALL would select the full dataspace
    hid_t file_space_id = filespace.hid();
    hid_t xfer_plist_id = H5P_DEFAULT;
    dset.write(type_id, value.origin(), mem_space_id, file_space_id, xfer_plist_id);
}
double kmedians::update_medians(cluster_sequence & clusters, dataset & medians) {
    const dataset & data = *m_ptr_data;
    const std::size_t dimension = data[0].size();

    std::vector<point> prev_medians(medians);

    medians.clear();
    medians.resize(clusters.size(), point(dimension, 0.0));

    double maximum_change = 0.0;
    for (std::size_t index_cluster = 0; index_cluster < clusters.size(); index_cluster++) {
        calculate_median(clusters[index_cluster], medians[index_cluster]);

        double change = m_metric(prev_medians[index_cluster], medians[index_cluster]);
        if (change > maximum_change) {
            maximum_change = change;
        }
    }

    return maximum_change;
}
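// calculate_median() is not shown above; the following is one plausible, purely
// illustrative way to compute a per-dimension median of the points in a cluster.
// It is a sketch under that assumption, not the library's actual implementation.
#include <algorithm>
#include <cstddef>
#include <vector>

using point_vec = std::vector<double>;

point_vec component_wise_median(const std::vector<point_vec>& cluster_points) {
    const std::size_t dimension = cluster_points.front().size();
    point_vec median(dimension, 0.0);
    for (std::size_t d = 0; d < dimension; ++d) {
        std::vector<double> values;
        values.reserve(cluster_points.size());
        for (const point_vec& p : cluster_points)
            values.push_back(p[d]);
        // nth_element places the middle value in position; for an even count this takes
        // the upper of the two middle elements rather than averaging them.
        std::nth_element(values.begin(), values.begin() + values.size() / 2, values.end());
        median[d] = values[values.size() / 2];
    }
    return median;
}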
void kmedians::update_clusters(const dataset & medians, cluster_sequence & clusters) {
    const dataset & data = *m_ptr_data;

    clusters.clear();
    clusters.resize(medians.size());

    for (size_t index_point = 0; index_point < data.size(); index_point++) {
        size_t index_cluster_optim = 0;
        double distance_optim = std::numeric_limits<double>::max();

        for (size_t index_cluster = 0; index_cluster < medians.size(); index_cluster++) {
            double distance = m_metric(data[index_point], medians[index_cluster]);
            if (distance < distance_optim) {
                index_cluster_optim = index_cluster;
                distance_optim = distance;
            }
        }

        clusters[index_cluster_optim].push_back(index_point);
    }

    erase_empty_clusters(clusters);
}
dataset_group dataset_splitter::split(const dataset & data) const {
    dataset_group group;
    vector<vector<int>> batch_ids = this->split_impl(data);
    for (std::size_t i = 0; i < batch_ids.size(); i++) {
        group.add_dataset(data.sub_set(batch_ids[i]));
    }
    return group;
}
void kmeans::assign_point_to_cluster(const std::size_t p_index_point, const dataset & p_centers, std::vector<std::size_t> & p_clusters) {
    double minimum_distance = std::numeric_limits<double>::max();
    size_t suitable_index_cluster = 0;

    for (size_t index_cluster = 0; index_cluster < p_centers.size(); index_cluster++) {
        double distance = m_metric(p_centers[index_cluster], (*m_ptr_data)[p_index_point]);

        if (distance < minimum_distance) {
            minimum_distance = distance;
            suitable_index_cluster = index_cluster;
        }
    }

    p_clusters[p_index_point] = suitable_index_cluster;
}
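// Standalone sketch of the nearest-center assignment performed by
// kmeans::assign_point_to_cluster above, using plain vectors and squared Euclidean
// distance as stand-ins for the library's dataset type and m_metric.
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

using point_t = std::vector<double>;

static double squared_euclidean(const point_t& a, const point_t& b) {
    double d = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i) {
        double diff = a[i] - b[i];
        d += diff * diff;
    }
    return d;
}

static std::size_t nearest_center(const point_t& p, const std::vector<point_t>& centers) {
    std::size_t best = 0;
    double best_distance = std::numeric_limits<double>::max();
    for (std::size_t c = 0; c < centers.size(); ++c) {
        double distance = squared_euclidean(p, centers[c]);
        if (distance < best_distance) {
            best_distance = distance;
            best = c;
        }
    }
    return best;
}

int main() {
    std::vector<point_t> centers{{0.0, 0.0}, {5.0, 5.0}};
    point_t p{4.0, 4.5};
    std::printf("point assigned to cluster %zu\n", nearest_center(p, centers));  // -> 1
}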
vector<vector<int>> random_shuffer_ratio_splitter::split_impl(const dataset& data) const {
    // Normalize the ratios so that they sum to 1. Accumulate with NumericType(0)
    // (not the integer literal 0) so the sum is not truncated to an integer.
    vector<NumericType> percent(ratio);
    NumericType total = std::accumulate(ratio.begin(), ratio.end(), NumericType(0));
    BOOST_FOREACH(NumericType & x, percent) {
        x = x/total;
    }
    // std::transform(percent.begin(), percent.end(), percent.begin(),
    //                [total](NumericType val){ return val/total; });

    vector<vector<int>> group_ids(percent.size());
    int sample_num = data.get_sample_num();

    // Shuffle all sample indexes, then cut them into consecutive groups by ratio.
    vector<int> temp;
    std::copy(boost::counting_iterator<unsigned int>(0),
              boost::counting_iterator<unsigned int>(sample_num),
              std::back_inserter(temp));
    std::random_shuffle(temp.begin(), temp.end());

    vector<int>::iterator cur_begin_iter = temp.begin();
    for (int i = 0; i < percent.size(); i++) {
        int cur_group_size = floor(sample_num * percent[i]);
        vector<int>::iterator cur_end_iter = cur_begin_iter + cur_group_size;
        if (i == percent.size() - 1) {
            // The last group absorbs any rounding remainder.
            cur_end_iter = temp.end();
            cur_group_size = cur_end_iter - cur_begin_iter;
        }
        vector<int> cur_group_id(cur_group_size);
        copy(cur_begin_iter, cur_end_iter, cur_group_id.begin());
        cur_begin_iter = cur_end_iter;
        group_ids[i] = cur_group_id;
    }
    return group_ids;
}
template <typename T>
typename boost::enable_if<is_multi_array<T>, void>::type
read_dataset(dataset & data_set, T & array, dataspace const& memspace, dataspace const& filespace) {
    // --- disabled this check, it is orthogonal to a useful feature (e.g. read from a 2D dataset into a 1D array)
    // const int array_rank = T::dimensionality;
    // if (!(memspace.rank() == array_rank)) {
    //     throw error("memory dataspace and array rank do not match");
    // }
    if (static_cast<hsize_t>(filespace.get_select_npoints()) > array.num_elements())
        H5XX_THROW("target array does not provide enough space to store selected dataspace elements");

    hid_t mem_space_id = memspace.hid();   // H5S_ALL would select the full dataspace
    hid_t file_space_id = filespace.hid();
    hid_t xfer_plist_id = H5P_DEFAULT;

    typedef typename T::element value_type;
    data_set.read(ctype<value_type>::hid(), array.origin(), mem_space_id, file_space_id, xfer_plist_id);
}
/**
 * Constructor which learns a Chow-Liu tree from the given dataset.
 * @param X Variables over which to learn a tree.
 * @param ds Dataset to use for computing marginals.
 */
chow_liu(const forward_range<typename F::variable_type*>& X_,
         const dataset<>& ds,
         const parameters& params = parameters())
    : params(params) {
    typedef typename F::variable_type variable_type;
    assert(ds.size() > 0);
    std::vector<variable_type*> X(X_.begin(), X_.end());
    if (X.size() == 0)
        return;

    // g will hold weights (mutual information) and factors F for each edge.
    typedef std::pair<double, F> edge_mi_pot;
    typedef undirected_graph<variable_type*, void_, edge_mi_pot> ig_type;
    ig_type g;

    foreach(variable_type* v, X)
        g.add_vertex(v);
    for (size_t i(0); i < X.size() - 1; ++i) {
        for (size_t j(i+1); j < X.size(); ++j) {
            typename F::domain_type edge_dom(make_domain<variable_type>(X[i], X[j]));
            F f((params.lambda < 0
                 ? learn_factor<F>::learn_marginal(edge_dom, ds)
                 : learn_factor<F>::learn_marginal(edge_dom, ds, params.lambda)));
            double mi(f.mutual_information(make_domain(X[i]), make_domain(X[j])));
            g.add_edge(X[i], X[j], std::make_pair(mi, f));
            if (params.retain_edge_score_mapping) {
                edge_score_mapping_[edge_dom] = mi;
            }
        }
    }

    // Create a MST over the graph g.
    std::vector<F> mst_factors;
    kruskal_minimum_spanning_tree
        (g, transformed_output(back_inserter(mst_factors),
                               impl::mst_edge2f_functor<F>(g)),
         impl::mst_weight_functor<F>(g));

    // Create a decomposable model consisting of the cliques in mst_edges
    model_ *= mst_factors;
}
vector<vector<int>> ordered_dataset_splitter::split_impl(const dataset& data) const {
    vector<vector<int>> batch_ids(batch_num);
    int sample_num = data.get_sample_num();
    int batch_size = ceil(float(sample_num)/batch_num);
    for (int i = 0; i < batch_num; i++) {
        int cur_batch_size = batch_size;
        if (i == batch_num - 1)
            cur_batch_size = sample_num - (batch_num - 1)*batch_size;

        vector<int> cur_batch_id(cur_batch_size);
        for (int j = 0; j < cur_batch_size; j++)
            cur_batch_id[j] = i*batch_size + j;
        // Assign into the pre-sized vector; the original push_back left batch_num
        // empty batches at the front and doubled the size of the result.
        batch_ids[i] = cur_batch_id;
    }
    return batch_ids;
}
void read_data(ifstream & in, dataset & data) {
    string line;
    sequence seq;

    while (getline(in, line)) {
        strtokenizer tok(line, " \t\r\n");
        int len = tok.count_tokens();

        // An empty line terminates the current sequence.
        if (len <= 0) {
            if (seq.size() > 0) {
                data.push_back(seq);
            }
            seq.clear();
            continue;
        }

        obsr ob;
        for (int i = 0; i < len; i++) {
            ob.push_back(tok.token(i));
        }
        seq.push_back(ob);
    }

    // Flush the last sequence if the file does not end with an empty line
    // (the original version silently dropped it in that case).
    if (seq.size() > 0) {
        data.push_back(seq);
    }
}
object::object(const dataset& object_) : object_handle_(object_.native_handle()) { }
int model::predict(const dataset &tds, gsl_matrix **pp) {
    int ret = -1;
    gsl_matrix *mat = NULL;
    gsl_matrix *ptv = NULL;
    gsl_matrix *km1 = NULL;
    gsl_matrix *km2 = NULL;
    gsl_matrix *res = NULL;
    gsl_matrix *stm = NULL;
    gsl_vector_view avg_col;
    gsl_vector_view dv;

    if (tds.ins_num() <= 0 || tds.fea_num() != (int)_col_mean->size) {
        ULIB_FATAL("invalid test dimensions, (ins_num=%d,fea_num=%d)",
                   tds.ins_num(), tds.fea_num());
        goto done;
    }
    mat = gsl_matrix_alloc(tds.ins_num(), tds.fea_num());
    if (mat == NULL) {
        ULIB_FATAL("couldn't allocate test feature matrix");
        goto done;
    }
    ptv = gsl_matrix_alloc(tds.ins_num(), 2);
    if (ptv == NULL) {
        ULIB_FATAL("couldn't allocate prediction matrix");
        goto done;
    }
    if (tds.get_matrix(mat)) {
        ULIB_FATAL("couldn't get test matrix");
        goto done;
    }
    dbg_print_mat(mat, "Test Matrix:");
    zero_out_mat(mat);
    norm_mat(mat);
    dbg_print_mat(mat, "Normalized Test Matrix:");
    km1 = comp_kern_mat(mat, _fm, _kern);
    if (km1 == NULL) {
        ULIB_FATAL("couldn't compute test1 kernel matrix");
        goto done;
    }
    dbg_print_mat(km1, "Test Kernel Matrix:");
    km2 = comp_kern_mat(mat, mat, _kern);
    if (km2 == NULL) {
        ULIB_FATAL("couldn't compute test2 kernel matrix");
        goto done;
    }
    dbg_print_mat(km2, "Test2 Kernel Matrix:");
    dv = gsl_matrix_diagonal(km2);
    res = gsl_matrix_alloc(km1->size1, _ikm->size2);
    if (res == NULL) {
        ULIB_FATAL("couldn't allocate temporary matrix");
        goto done;
    }
    stm = gsl_matrix_alloc(km2->size1, km2->size2);
    if (stm == NULL) {
        ULIB_FATAL("couldn't allocate std matrix");
        goto done;
    }
    // res = km1 * K^-1; covariance becomes km2 - res * km1^T; the mean is res * t (+ _t_avg).
    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, km1, _ikm, 0.0, res);
    gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, res, km1, 0.0, stm);
    gsl_matrix_sub(km2, stm);
    dbg_print_mat(res, "Predictive Matrix:");
    avg_col = gsl_matrix_column(ptv, 0);
    gsl_blas_dgemv(CblasNoTrans, 1.0, res, _tv, 0.0, &avg_col.vector);
    gsl_vector_add_constant(&avg_col.vector, _t_avg);
    gsl_matrix_scale(km2, _t_std*_t_std);
    gsl_vector_add_constant(&dv.vector, _noise_var);
    for (size_t i = 0; i < km2->size1; ++i)
        gsl_matrix_set(ptv, i, 1, sqrt(gsl_vector_get(&dv.vector, i)));
    *pp = ptv;
    ptv = NULL;
    ret = 0;
done:
    gsl_matrix_free(mat);
    gsl_matrix_free(ptv);
    gsl_matrix_free(km1);
    gsl_matrix_free(km2);
    gsl_matrix_free(res);
    gsl_matrix_free(stm);
    return ret;
}
// call this to compute precision, recall, and F1-measure
void evaluate(dataset & data, string & chunktype, labelset & labels, chunkset & chunks) {
    map<string, int> lbstr2int;
    map<int, string> lbint2str;
    vector<int> human_lb_count, model_lb_count, human_model_lb_count;

    int i;
    int num_labels = labels.size();
    for (i = 0; i < num_labels; i++) {
        lbstr2int.insert(pair<string, int>(labels[i], i));
        lbint2str.insert(pair<int, string>(i, labels[i]));
        human_lb_count.push_back(0);
        model_lb_count.push_back(0);
        human_model_lb_count.push_back(0);
    }

    // start to count: the second-to-last column holds the human (gold) label,
    // the last column holds the model label
    dataset::iterator datait;
    sequence::iterator seqit;
    for (datait = data.begin(); datait != data.end(); datait++) {
        for (seqit = datait->begin(); seqit != datait->end(); seqit++) {
            int label = str_2_int(lbstr2int, (*seqit)[seqit->size() - 2]);
            int model_label = str_2_int(lbstr2int, (*seqit)[seqit->size() - 1]);

            if (label >= 0 && label < num_labels) {
                human_lb_count[label]++;
            }
            if (model_label >= 0 && model_label < num_labels) {
                model_lb_count[model_label]++;
            }
            if (label == model_label && label >= 0 && label < num_labels) {
                human_model_lb_count[label]++;
            }
        }
    }

    // print out ("%%" is needed to print a literal '%' in the headers)
    printf("\tLabel-based performance evaluation:\n\n");
    printf("\t\tLabel\tManual\tModel\tMatch\tPre.(%%)\tRec.(%%)\tF1-Measure(%%)\n");
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    int count = 0;
    double precision = 0.0, recall = 0.0, f1,
           total1_pre = 0.0, total1_rec = 0.0, total1_f1 = 0.0,
           total2_pre = 0.0, total2_rec = 0.0, total2_f1 = 0.0;
    int total_human = 0, total_model = 0, total_match = 0;

    for (i = 0; i < num_labels; i++) {
        if (model_lb_count[i] > 0) {
            precision = (double)human_model_lb_count[i] / model_lb_count[i];
            total_model += model_lb_count[i];
            total1_pre += precision;
        } else {
            precision = 0.0;
        }

        if (human_lb_count[i] > 0) {
            recall = (double)human_model_lb_count[i] / human_lb_count[i];
            total_human += human_lb_count[i];
            total1_rec += recall;
            count++;
        } else {
            recall = 0.0;
        }

        total_match += human_model_lb_count[i];

        if (recall + precision > 0) {
            f1 = (double) 2 * precision * recall / (precision + recall);
        } else {
            f1 = 0;
        }

        char buff[50];
        sprintf(buff, "%d", i);
        string strlabel = int_2_str(lbint2str, i);
        if (strlabel != "") {
            sprintf(buff, "%s", strlabel.c_str());
        }

        printf("\t\t%s\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n", buff,
               human_lb_count[i], model_lb_count[i], human_model_lb_count[i],
               precision * 100, recall * 100, f1 * 100);
    }

    // print the average performance; guard the divisions against empty counts
    // (Avg1. is the macro-average over labels, Avg2. is the micro-average over tokens)
    if (count > 0) {
        total1_pre /= count;
        total1_rec /= count;
        if (total1_pre + total1_rec > 0) {
            total1_f1 = 2 * total1_pre * total1_rec / (total1_pre + total1_rec);
        }
    }
    if (total_model > 0) {
        total2_pre = (double)total_match / total_model;
    }
    if (total_human > 0) {
        total2_rec = (double)total_match / total_human;
    }
    if (total2_pre + total2_rec > 0) {
        total2_f1 = 2 * total2_pre * total2_rec / (total2_pre + total2_rec);
    }

    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");
    printf("\t\tAvg1.\t\t\t\t%6.2f\t%6.2f\t%6.2f\n",
           total1_pre * 100, total1_rec * 100, total1_f1 * 100);
    printf("\t\tAvg2.\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n\n",
           total_human, total_model, total_match,
           total2_pre * 100, total2_rec * 100, total2_f1 * 100);

    if (chunks.size() <= 0) {
        return;
    }

    // chunk based evaluation
    if (chunktype == "IOB1") {
        chunk_evaluate_iob1(data, chunks);
    }
    if (chunktype == "IOB2") {
        chunk_evaluate_iob2(data, chunks);
    }
    if (chunktype == "IOE1") {
        chunk_evaluate_ioe1(data, chunks);
    }
    if (chunktype == "IOE2") {
        chunk_evaluate_ioe2(data, chunks);
    }
}
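// Self-contained sketch of the precision/recall/F1 formulas used by the evaluation
// functions above, computed from raw match counts; the counts below are made up.
#include <cstdio>

int main() {
    int human = 120, model = 110, match = 95;                 // gold, predicted, correct
    double pre = model > 0 ? (double)match / model : 0.0;     // precision
    double rec = human > 0 ? (double)match / human : 0.0;     // recall
    double f1  = pre + rec > 0 ? 2 * pre * rec / (pre + rec) : 0.0;
    std::printf("Pre=%.2f%% Rec=%.2f%% F1=%.2f%%\n", pre * 100, rec * 100, f1 * 100);
}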
double chunk_evaluate_ioe1(dataset & data, chunkset & chunks) {
    vector<int> human_chk_count;
    vector<int> model_chk_count;
    vector<int> match_chk_count;

    int i;
    int num_chunks = chunks.size();
    for (i = 0; i < num_chunks; i++) {
        human_chk_count.push_back(0);
        model_chk_count.push_back(0);
        match_chk_count.push_back(0);
    }

    dataset::iterator datait;
    for (datait = data.begin(); datait != data.end(); datait++) {
        for (i = 0; i < num_chunks; i++) {
            human_chk_count[i] += count_chunks_ioe1(1, *datait, chunks[i][0], chunks[i][1]);
            model_chk_count[i] += count_chunks_ioe1(2, *datait, chunks[i][0], chunks[i][1]);
            match_chk_count[i] += count_matching_chunks_ioe1(*datait, chunks[i][0], chunks[i][1]);
        }
    }

    printf("\tChunk-based performance evaluation:\n\n");
    printf("\t\tChunk\tManual\tModel\tMatch\tPre.(%%)\tRec.(%%)\tF1-Measure(%%)\n");
    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    int count = 0;
    double pre = 0.0, rec = 0.0, f1 = 0.0;
    double total1_pre = 0.0, total1_rec = 0.0, total1_f1 = 0.0;
    double total2_pre = 0.0, total2_rec = 0.0, total2_f1 = 0.0;
    int total_human = 0, total_model = 0, total_match = 0;

    for (i = 0; i < num_chunks; i++) {
        if (model_chk_count[i] > 0) {
            pre = (double)match_chk_count[i] / model_chk_count[i];
            total_model += model_chk_count[i];
            total1_pre += pre;
        } else {
            pre = 0.0;
        }

        if (human_chk_count[i] > 0) {
            rec = (double)match_chk_count[i] / human_chk_count[i];
            total_human += human_chk_count[i];
            total1_rec += rec;
            count++;
        } else {
            rec = 0.0;
        }

        total_match += match_chk_count[i];

        if (pre + rec > 0) {
            f1 = (double) 2 * pre * rec / (pre + rec);
        } else {
            f1 = 0.0;
        }

        printf("\t\t%s\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n", chunks[i][2].c_str(),
               human_chk_count[i], model_chk_count[i], match_chk_count[i],
               pre * 100, rec * 100, f1 * 100);
    }

    printf("\t\t-----\t------\t-----\t-----\t-------\t-------\t-------------\n");

    if (count > 0) {
        total1_pre /= count;
        total1_rec /= count;
        if (total1_pre + total1_rec > 0) {
            total1_f1 = 2 * total1_pre * total1_rec / (total1_pre + total1_rec);
        }
        printf("\t\tAvg1.\t\t\t\t%6.2f\t%6.2f\t%6.2f\n",
               total1_pre * 100, total1_rec * 100, total1_f1 * 100);
    }

    if (total_model > 0) {
        total2_pre = (double)total_match / total_model;
    }
    if (total_human > 0) {
        total2_rec = (double)total_match / total_human;
    }
    if (total2_pre + total2_rec > 0) {
        total2_f1 = 2 * total2_rec * total2_pre / (total2_rec + total2_pre);
    }
    printf("\t\tAvg2.\t%d\t%d\t%d\t%6.2f\t%6.2f\t%6.2f\n\n",
           total_human, total_model, total_match,
           total2_pre * 100, total2_rec * 100, total2_f1 * 100);

    return total2_f1 * 100;
}
void attach_dimension_scale(const dimension_scale& dim_scale, const dataset& to, unsigned int dimension) {
    if (H5DSattach_scale(to.id(), dim_scale.id(), dimension) < 0)
        throw_on_hdf5_error();
}
void ASSERT_CLUSTER_NOISE_SIZES(
    const dataset & p_data,
    const cluster_sequence & p_actual_clusters,
    const std::vector<std::size_t> & p_expected_cluster_length,
    const noise & p_actual_noise,
    const std::size_t & p_expected_noise_length,
    const index_sequence & p_indexes)
{
    if (p_expected_cluster_length.empty() && p_actual_clusters.empty()) {
        return;
    }

    std::size_t total_size = 0;
    std::unordered_map<std::size_t, bool> unique_objects;
    std::vector<std::size_t> obtained_cluster_length;

    for (auto & cluster : p_actual_clusters) {
        total_size += cluster.size();
        obtained_cluster_length.push_back(cluster.size());

        for (auto index_object : cluster) {
            unique_objects[index_object] = false;
        }
    }

    total_size += p_actual_noise.size();
    for (auto index_object : p_actual_noise) {
        unique_objects[index_object] = false;
    }

    ASSERT_EQ(total_size, unique_objects.size());

    if (!p_expected_cluster_length.empty()) {
        std::size_t expected_total_size =
            std::accumulate(p_expected_cluster_length.cbegin(), p_expected_cluster_length.cend(), (std::size_t) 0);
        if (p_expected_noise_length != (std::size_t) -1) {
            expected_total_size += p_expected_noise_length;
        }

        ASSERT_EQ(expected_total_size, total_size);

        std::sort(obtained_cluster_length.begin(), obtained_cluster_length.end());

        std::vector<size_t> sorted_expected_cluster_length(p_expected_cluster_length);
        std::sort(sorted_expected_cluster_length.begin(), sorted_expected_cluster_length.end());

        for (size_t i = 0; i < obtained_cluster_length.size(); i++) {
            ASSERT_EQ(obtained_cluster_length[i], sorted_expected_cluster_length[i]);
        }
    }
    else {
        if (!p_indexes.empty()) {
            ASSERT_EQ(p_indexes.size(), unique_objects.size());
            for (auto index : p_indexes) {
                ASSERT_TRUE(unique_objects.find(index) != unique_objects.cend());
            }
        }
        else {
            ASSERT_EQ(p_data.size(), total_size);
        }
    }

    if (p_expected_noise_length != (std::size_t) -1) {
        ASSERT_EQ(p_expected_noise_length, p_actual_noise.size());
    }
}