void silhouette_ksearch::process(const dataset & p_data, silhouette_ksearch_data & p_result) { if (m_kmax > p_data.size()) { throw std::invalid_argument("K max value '" + std::to_string(m_kmax) + "' should be bigger than amount of objects '" + std::to_string(p_data.size()) + "' in input data."); } p_result.scores().reserve(m_kmax - m_kmin); for (std::size_t k = m_kmin; k < m_kmax; k++) { cluster_sequence clusters; m_allocator->allocate(k, p_data, clusters); if (clusters.size() != k) { p_result.scores().push_back(std::nan("1")); continue; } silhouette_data result; silhouette().process(p_data, clusters, result); const double score = std::accumulate(result.get_score().begin(), result.get_score().end(), (double) 0.0) / result.get_score().size(); p_result.scores().push_back(score); if (score > p_result.get_score()) { p_result.set_amount(k); p_result.set_score(score); } } }
void kmeans::update_clusters(const dataset & p_centers, cluster_sequence & p_clusters) { const dataset & data = *m_ptr_data; p_clusters.clear(); p_clusters.resize(p_centers.size()); /* fill clusters again in line with centers. */ if (m_ptr_indexes->empty()) { std::vector<std::size_t> winners(data.size(), 0); parallel_for(std::size_t(0), data.size(), [this, &p_centers, &winners](std::size_t p_index) { assign_point_to_cluster(p_index, p_centers, winners); }); for (std::size_t index_point = 0; index_point < winners.size(); index_point++) { const std::size_t suitable_index_cluster = winners[index_point]; p_clusters[suitable_index_cluster].push_back(index_point); } } else { /* This part of code is used by X-Means and in case of parallel implementation of this part in scope of X-Means performance is slightly reduced. Experiments has been performed our implementation and Intel TBB library. But in K-Means case only - it works perfectly and increase performance. */ std::vector<std::size_t> winners(data.size(), 0); parallel_for_each(*m_ptr_indexes, [this, &p_centers, &winners](std::size_t p_index) { assign_point_to_cluster(p_index, p_centers, winners); }); for (std::size_t index_point : *m_ptr_indexes) { const std::size_t suitable_index_cluster = winners[index_point]; p_clusters[suitable_index_cluster].push_back(index_point); } } erase_empty_clusters(p_clusters); }
void kmedians::update_clusters(const dataset & medians, cluster_sequence & clusters) { const dataset & data = *m_ptr_data; clusters.clear(); clusters.resize(medians.size()); for (size_t index_point = 0; index_point < data.size(); index_point++) { size_t index_cluster_optim = 0; double distance_optim = std::numeric_limits<double>::max(); for (size_t index_cluster = 0; index_cluster < medians.size(); index_cluster++) { double distance = m_metric(data[index_point], medians[index_cluster]); if (distance < distance_optim) { index_cluster_optim = index_cluster; distance_optim = distance; } } clusters[index_cluster_optim].push_back(index_point); } erase_empty_clusters(clusters); }
void kmeans::assign_point_to_cluster(const std::size_t p_index_point, const dataset & p_centers, std::vector<std::size_t> & p_clusters) { double minimum_distance = std::numeric_limits<double>::max(); size_t suitable_index_cluster = 0; for (size_t index_cluster = 0; index_cluster < p_centers.size(); index_cluster++) { double distance = m_metric(p_centers[index_cluster], (*m_ptr_data)[p_index_point]); if (distance < minimum_distance) { minimum_distance = distance; suitable_index_cluster = index_cluster; } } p_clusters[p_index_point] = suitable_index_cluster; }
/**
 * Constructor which learns a Chow-Liu tree from the given dataset.
 * It learns a pairwise marginal for every pair of variables, weights the
 * complete graph over the variables by pairwise mutual information, extracts
 * a spanning tree, and multiplies the tree factors into the model.
 * @param X Variables over which to learn a tree.
 * @param ds Dataset to use for computing marginals.
 */
chow_liu(const forward_range<typename F::variable_type*>& X_,
         const dataset<>& ds,
         const parameters& params = parameters())
  : params(params) {
  typedef typename F::variable_type variable_type;
  // An empty dataset yields no marginals to learn from.
  assert(ds.size() > 0);
  std::vector<variable_type*> X(X_.begin(), X_.end());
  // Nothing to learn over an empty variable set.
  if (X.size() == 0)
    return;
  // g will hold weights (mutual information) and factors F for each edge.
  typedef std::pair<double, F> edge_mi_pot;
  typedef undirected_graph<variable_type*, void_, edge_mi_pot> ig_type;
  ig_type g;
  foreach(variable_type* v, X)
    g.add_vertex(v);
  // Build the complete graph: one learned pairwise marginal per variable pair.
  for (size_t i(0); i < X.size() - 1; ++i) {
    for (size_t j(i+1); j < X.size(); ++j) {
      typename F::domain_type edge_dom(make_domain<variable_type>(X[i],X[j]));
      // Negative lambda means "no regularization": use the overload without it.
      F f((params.lambda < 0
           ? learn_factor<F>::learn_marginal(edge_dom, ds)
           : learn_factor<F>::learn_marginal(edge_dom, ds, params.lambda)));
      double mi(f.mutual_information(make_domain(X[i]), make_domain(X[j])));
      g.add_edge(X[i], X[j], std::make_pair(mi, f));
      // Optionally remember each edge's score for later inspection.
      if (params.retain_edge_score_mapping) {
        edge_score_mapping_[edge_dom] = mi;
      }
    }
  }
  // Create a MST over the graph g.
  // NOTE(review): Chow-Liu needs a MAXIMUM-MI spanning tree; presumably
  // mst_weight_functor returns negated mutual information so that the
  // minimum spanning tree maximizes total MI -- confirm in impl::.
  std::vector<F> mst_factors;
  kruskal_minimum_spanning_tree
    (g,
     transformed_output(back_inserter(mst_factors),
                        impl::mst_edge2f_functor<F>(g)),
     impl::mst_weight_functor<F>(g));
  // Create a decomposable model consisting of the cliques in mst_edges
  model_ *= mst_factors;
}
void ASSERT_CLUSTER_NOISE_SIZES( const dataset & p_data, const cluster_sequence & p_actual_clusters, const std::vector<std::size_t> & p_expected_cluster_length, const noise & p_actual_noise, const std::size_t & p_expected_noise_length, const index_sequence & p_indexes) { if (p_expected_cluster_length.empty() && p_actual_clusters.empty()) { return; } std::size_t total_size = 0; std::unordered_map<std::size_t, bool> unique_objects; std::vector<std::size_t> obtained_cluster_length; for (auto & cluster : p_actual_clusters) { total_size += cluster.size(); obtained_cluster_length.push_back(cluster.size()); for (auto index_object : cluster) { unique_objects[index_object] = false; } } total_size += p_actual_noise.size(); for (auto index_object : p_actual_noise) { unique_objects[index_object] = false; } ASSERT_EQ(total_size, unique_objects.size()); if (!p_expected_cluster_length.empty()) { std::size_t expected_total_size = std::accumulate(p_expected_cluster_length.cbegin(), p_expected_cluster_length.cend(), (std::size_t) 0); if (p_expected_noise_length != (std::size_t) -1) { expected_total_size += p_expected_noise_length; } ASSERT_EQ(expected_total_size, total_size); std::sort(obtained_cluster_length.begin(), obtained_cluster_length.end()); std::vector<size_t> sorted_expected_cluster_length(p_expected_cluster_length); std::sort(sorted_expected_cluster_length.begin(), sorted_expected_cluster_length.end()); for (size_t i = 0; i < obtained_cluster_length.size(); i++) { ASSERT_EQ(obtained_cluster_length[i], sorted_expected_cluster_length[i]); } } else { if (!p_indexes.empty()) { ASSERT_EQ(p_indexes.size(), unique_objects.size()); for (auto index : p_indexes) { ASSERT_TRUE( unique_objects.find(index) != unique_objects.cend() ); } } else { ASSERT_EQ(p_data.size(), total_size); } } if (p_expected_noise_length != (std::size_t) -1) { ASSERT_EQ(p_expected_noise_length, p_actual_noise.size()); } }