예제 #1
0
/**
 * Searches the amount of clusters K in [m_kmin, m_kmax) that maximizes the
 * average silhouette score over the input data.
 *
 * @param p_data    Input objects to cluster.
 * @param p_result  Receives the per-K scores, the best K and its score.
 *
 * @throws std::invalid_argument if m_kmax exceeds the amount of input objects
 *         (a clustering into more clusters than objects is impossible).
 */
void silhouette_ksearch::process(const dataset & p_data, silhouette_ksearch_data & p_result) {
    if (m_kmax > p_data.size()) {
        /* Message fixed: the guard fires when K max is TOO BIG, so it must not
           be bigger than the amount of objects. */
        throw std::invalid_argument("K max value '" + std::to_string(m_kmax) + 
            "' should not be bigger than amount of objects '" + std::to_string(p_data.size()) + "' in input data.");
    }

    /* One score is produced per K in [m_kmin, m_kmax). */
    p_result.scores().reserve(m_kmax - m_kmin);

    for (std::size_t k = m_kmin; k < m_kmax; k++) {
        cluster_sequence clusters;
        m_allocator->allocate(k, p_data, clusters);

        /* The allocator may fail to produce exactly K clusters; mark that K as
           invalid with NaN and keep searching. */
        if (clusters.size() != k) {
            p_result.scores().push_back(std::nan("1"));
            continue;
        }
        
        silhouette_data result;
        silhouette().process(p_data, clusters, result);

        /* Average silhouette over all objects for this K. */
        const double score = std::accumulate(result.get_score().begin(), result.get_score().end(), (double) 0.0) / result.get_score().size();
        p_result.scores().push_back(score);

        /* Track the best K seen so far. */
        if (score > p_result.get_score()) {
            p_result.set_amount(k);
            p_result.set_score(score);
        }
    }
}
예제 #2
0
void kmeans::update_clusters(const dataset & p_centers, cluster_sequence & p_clusters) {
    const dataset & data = *m_ptr_data;

    p_clusters.clear();
    p_clusters.resize(p_centers.size());

    /* fill clusters again in line with centers. */
    if (m_ptr_indexes->empty()) {
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for(std::size_t(0), data.size(), [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point = 0; index_point < winners.size(); index_point++) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }
    else {
        /* This part of code is used by X-Means and in case of parallel implementation of this part in scope of X-Means
           performance is slightly reduced. Experiments has been performed our implementation and Intel TBB library. 
           But in K-Means case only - it works perfectly and increase performance. */
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for_each(*m_ptr_indexes, [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point : *m_ptr_indexes) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }

    erase_empty_clusters(p_clusters);
}
예제 #3
0
void kmedians::update_clusters(const dataset & medians, cluster_sequence & clusters) {
    const dataset & data = *m_ptr_data;

    clusters.clear();
    clusters.resize(medians.size());

    for (size_t index_point = 0; index_point < data.size(); index_point++) {
        size_t index_cluster_optim = 0;
        double distance_optim = std::numeric_limits<double>::max();

        for (size_t index_cluster = 0; index_cluster < medians.size(); index_cluster++) {
            double distance = m_metric(data[index_point], medians[index_cluster]);
            if (distance < distance_optim) {
                index_cluster_optim = index_cluster;
                distance_optim = distance;
            }
        }

        clusters[index_cluster_optim].push_back(index_point);
    }

    erase_empty_clusters(clusters);
}
예제 #4
0
void kmeans::assign_point_to_cluster(const std::size_t p_index_point, const dataset & p_centers, std::vector<std::size_t> & p_clusters) {
    double    minimum_distance = std::numeric_limits<double>::max();
    size_t    suitable_index_cluster = 0;

    for (size_t index_cluster = 0; index_cluster < p_centers.size(); index_cluster++) {
        double distance = m_metric(p_centers[index_cluster], (*m_ptr_data)[p_index_point]);

        if (distance < minimum_distance) {
            minimum_distance = distance;
            suitable_index_cluster = index_cluster;
        }
    }

    p_clusters[p_index_point] = suitable_index_cluster;
}
예제 #5
0
파일: chow_liu.hpp 프로젝트: vdeepak13/sill
    /**
     * Constructor which learns a Chow-Liu tree from the given dataset.
     *
     * Builds a complete graph over the variables whose edges are weighted by
     * pairwise mutual information (with factors learned from the dataset),
     * extracts a spanning tree from it, and multiplies the tree's edge
     * factors into the decomposable model `model_`.
     *
     * @param X_            Variables over which to learn a tree.
     * @param ds            Dataset to use for computing marginals; must be non-empty.
     * @param params        Learning parameters; `lambda < 0` disables regularized
     *                      marginal learning (see below).
     */
    chow_liu(const forward_range<typename F::variable_type*>& X_,
             const dataset<>& ds, const parameters& params = parameters())
      : params(params) {

      typedef typename F::variable_type variable_type;
      assert(ds.size() > 0);
      std::vector<variable_type*> X(X_.begin(), X_.end());
      // Nothing to learn over an empty variable set.
      if (X.size() == 0)
        return;

      // g will hold weights (mutual information) and factors F for each edge.
      typedef std::pair<double, F> edge_mi_pot;
      typedef undirected_graph<variable_type*, void_, edge_mi_pot> ig_type;
      ig_type g;
      foreach(variable_type* v, X)
        g.add_vertex(v);
      // Learn a pairwise marginal and its mutual information for every
      // unordered pair {X[i], X[j]}; the early return above guarantees
      // X.size() >= 1, so the size_t expression X.size() - 1 cannot wrap.
      for (size_t i(0); i < X.size() - 1; ++i) {
        for (size_t j(i+1); j < X.size(); ++j) {
          typename F::domain_type
            edge_dom(make_domain<variable_type>(X[i],X[j]));
          // Negative lambda selects the unregularized marginal learner.
          F f((params.lambda < 0 ?
               learn_factor<F>::learn_marginal(edge_dom, ds) :
               learn_factor<F>::learn_marginal(edge_dom, ds, params.lambda)));
          double mi(f.mutual_information(make_domain(X[i]), make_domain(X[j])));
          g.add_edge(X[i], X[j], std::make_pair(mi, f));
          if (params.retain_edge_score_mapping) {
            edge_score_mapping_[edge_dom] = mi;
          }
        }
      }

      // Create a MST over the graph g.
      // NOTE(review): Chow-Liu needs the MAXIMUM-MI tree; presumably
      // impl::mst_weight_functor negates the MI so Kruskal's minimum spanning
      // tree yields it — confirm in the functor's implementation.
      std::vector<F> mst_factors;
      kruskal_minimum_spanning_tree
        (g, transformed_output(back_inserter(mst_factors),
                               impl::mst_edge2f_functor<F>(g)),
         impl::mst_weight_functor<F>(g));

      // Create a decomposable model consisting of the cliques in mst_edges
      model_ *= mst_factors;
    }
예제 #6
0
/**
 * Verifies that a clustering result (clusters + noise) is consistent with the
 * expectations:
 *  - no object appears in two clusters or in both a cluster and the noise;
 *  - when expected cluster lengths are given, the multiset of actual cluster
 *    sizes matches it (and the totals agree, including expected noise);
 *  - otherwise, either the given index subset or the whole dataset must be
 *    fully covered;
 *  - when an expected noise length is given, the actual noise size matches.
 *
 * @param p_data                     Input objects that were clustered.
 * @param p_actual_clusters          Clusters produced by the algorithm.
 * @param p_expected_cluster_length  Expected cluster sizes (any order); empty
 *                                   means "don't check sizes".
 * @param p_actual_noise             Noise objects produced by the algorithm.
 * @param p_expected_noise_length    Expected noise size, or (size_t)-1 to skip.
 * @param p_indexes                  Subset of indexes that was clustered; empty
 *                                   means the whole dataset was used.
 */
void ASSERT_CLUSTER_NOISE_SIZES(
    const dataset & p_data,
    const cluster_sequence & p_actual_clusters,
    const std::vector<std::size_t> & p_expected_cluster_length,
    const noise & p_actual_noise,
    const std::size_t & p_expected_noise_length,
    const index_sequence & p_indexes)
{
    if (p_expected_cluster_length.empty() && p_actual_clusters.empty()) {
        return;
    }

    std::size_t total_size = 0;
    std::unordered_set<std::size_t> unique_objects;   /* membership only; values were never used */
    std::vector<std::size_t> obtained_cluster_length;

    for (auto & cluster : p_actual_clusters) {
        total_size += cluster.size();

        obtained_cluster_length.push_back(cluster.size());

        for (auto index_object : cluster) {
            unique_objects.insert(index_object);
        }
    }

    total_size += p_actual_noise.size();
    for (auto index_object : p_actual_noise) {
        unique_objects.insert(index_object);
    }

    /* If any object was assigned twice, the unique count is smaller. */
    ASSERT_EQ(total_size, unique_objects.size());

    if (!p_expected_cluster_length.empty()) {
        std::size_t expected_total_size = std::accumulate(p_expected_cluster_length.cbegin(), p_expected_cluster_length.cend(), (std::size_t) 0);
        if (p_expected_noise_length != (std::size_t) -1) {
             expected_total_size += p_expected_noise_length;
        }

        ASSERT_EQ(expected_total_size, total_size);

        /* Fix: assert the amount of clusters matches BEFORE the element-wise
           comparison — otherwise extra actual clusters would index past the
           end of the expected-length vector (out-of-bounds read). */
        ASSERT_EQ(p_expected_cluster_length.size(), obtained_cluster_length.size());

        /* Compare size multisets regardless of cluster ordering. */
        std::sort(obtained_cluster_length.begin(), obtained_cluster_length.end());

        std::vector<size_t> sorted_expected_cluster_length(p_expected_cluster_length);
        std::sort(sorted_expected_cluster_length.begin(), sorted_expected_cluster_length.end());

        for (size_t i = 0; i < obtained_cluster_length.size(); i++) {
            ASSERT_EQ(obtained_cluster_length[i], sorted_expected_cluster_length[i]);
        }
    }
    else
    {
        if (!p_indexes.empty()) {
            /* A subset was clustered: every requested index must be covered. */
            ASSERT_EQ(p_indexes.size(), unique_objects.size());

            for (auto index : p_indexes) {
                ASSERT_TRUE( unique_objects.find(index) != unique_objects.cend() );
            }
        }
        else {
            /* The whole dataset was clustered: nothing may be lost. */
            ASSERT_EQ(p_data.size(), total_size);
        }
    }

    if (p_expected_noise_length != (std::size_t) -1) {
        ASSERT_EQ(p_expected_noise_length, p_actual_noise.size());
    }
}