void bicriteria_as_coreset( const wplist& src, wplist bic, const csize_t dstsize, wplist& dst) { typedef wplist::const_iterator citer; typedef wplist::iterator iter; bic.resize(dstsize - dst.size()); for (iter it = bic.begin(); it != bic.end(); ++it) { it->weight = 0; } for (iter it = dst.begin(); it != dst.end(); ++it) { pair<int, double> m = min_dist(*it, bic); bic[m.first].weight -= it->weight; } for (citer it = src.begin(); it != src.end(); ++it) { pair<int, double> m = min_dist(*it, bic); bic[m.first].weight += it->weight; } std::copy(bic.begin(), bic.begin()+dstsize-dst.size(), std::back_inserter(dst)); for (iter it = dst.begin(); it != dst.end(); ++it) { if (it->weight < 0) { it->weight = 0; } } }
/**
 * Iteratively refines kcenters_ from `points` until the centers stop moving.
 *
 * Returns immediately when there are fewer points than k_. In each pass every
 * point is assigned to the center reported by min_dist; its data and weight
 * are accumulated for that center via scalar_mul_and_add (presumably adding
 * weight * data to the accumulator -- confirm against the helper). Each
 * center that received weight is replaced by the accumulator scaled by
 * 1 / total weight; a center that received no weight keeps its old value.
 * The loop ends after a pass in which no center moved by more than 1e-9
 * (as measured by dist).
 *
 * @param points weighted points to cluster
 */
void kmeans_clustering_method::do_batch_update(wplist& points) {
  // The original declared a function-local static random generator here that
  // was never used; it has been removed.
  if (points.size() < k_) {
    return;
  }

  bool terminated = false;
  while (!terminated) {
    vector<common::sfv_t> kcenters_new(k_);
    vector<double> center_count(k_, 0);

    // Assignment step: accumulate each point into its nearest center.
    for (wplist::iterator it = points.begin(); it != points.end(); ++it) {
      pair<int64_t, double> m = min_dist(it->data, kcenters_);
      scalar_mul_and_add(it->data, it->weight, kcenters_new[m.first]);
      center_count[m.first] += it->weight;
    }

    // Update step: normalise the accumulators and check for convergence.
    terminated = true;
    for (size_t i = 0; i < k_; ++i) {
      if (center_count[i] == 0) {
        // No weight was assigned to this center: keep it where it was.
        kcenters_new[i] = kcenters_[i];
        continue;
      }
      kcenters_new[i] = scalar_dot(kcenters_new[i], 1.0 / center_count[i]);
      double d = dist(kcenters_new[i], kcenters_[i]);
      if (d > 1e-9) {
        terminated = false;
      }
    }
    kcenters_ = kcenters_new;
  }
}
void kmeans_compressor::get_bicriteria( const wplist& src, csize_t bsize, csize_t dstsize, wplist& dst) { timer_start(); dst.clear(); wplist resid = src; vector<double> weights(src.size()); double r = (1 - exp(bsize*(log(bsize)-log(src.size()))/dstsize)) / 2; r = max(0.1, r); std::vector<size_t> ind(bsize); while (resid.size() > 1 && dst.size() < dstsize) { timer_start(); weights.resize(resid.size()); for (wplist::iterator it = resid.begin(); it != resid.end(); ++it) { weights[it - resid.begin()] = it->weight; } discrete_distribution d(weights.begin(), weights.end()); std::generate(ind.begin(), ind.end(), d); std::sort(ind.begin(), ind.end()); std::vector<size_t>::iterator it = std::unique(ind.begin(), ind.end()); ind.erase(it, ind.end()); for (it = ind.begin(); it != ind.end(); ++it) { weighted_point p = resid[*it]; p.free_long = 0; dst.push_back(p); } for (wplist::iterator itr = resid.begin(); itr != resid.end(); ++itr) { (*itr).free_double = min_dist(*itr, dst).second; } std::sort(resid.begin(), resid.end(), compare_weight); csize_t size = (csize_t)std::min(static_cast<double>(resid.size()), static_cast<double>(resid.size()*r)); if (size == 0) { size = 1; } resid.resize(size); } }
/**
 * Chooses k_ initial centers from `points` and stores them in kcenters_.
 *
 * Does nothing (kcenters_ is left untouched) when `points` is empty or holds
 * fewer than k_ points. Otherwise kcenters_ is cleared, the data of the first
 * point becomes the first center, and further centers are picked one at a
 * time: each point gets the weight (distance to its nearest chosen center, as
 * reported by min_dist) * (point weight), and the index of the next center is
 * drawn from a discrete_distribution built over those weights (presumably
 * sampling proportionally to them -- confirm against discrete_distribution).
 *
 * @param points weighted points to pick the centers from
 */
void kmeans_clustering_method::initialize_centers(wplist& points) {
  // points.empty() is checked explicitly: with k_ == 0 the size comparison
  // alone would let an empty list through and points[0] below would read
  // past the end of the container.
  if (points.empty() || points.size() < k_) {
    return;
  }

  kcenters_.clear();
  kcenters_.push_back(points[0].data);

  vector<double> weights;
  while (kcenters_.size() < k_) {
    weights.clear();
    for (wplist::iterator it = points.begin(); it != points.end(); ++it) {
      pair<int64_t, double> m = min_dist(it->data, kcenters_);
      weights.push_back(m.second * it->weight);
    }
    discrete_distribution d(weights.begin(), weights.end());
    kcenters_.push_back(points[d()].data);
  }
}
void kmeans_compressor::bicriteria_to_coreset( wplist& src, wplist& bicriteria, csize_t dstsize, wplist& dst) { if (bicriteria.size() == 0) { dst = src; return; } double weight_sum = 0; double squared_min_dist_sum = 0; for (wplist::iterator it = src.begin(); it != src.end(); ++it) { std::pair<int, double> m = min_dist(*it, bicriteria); (*it).free_long = m.first; (*it).free_double = m.second; bicriteria.at((*it).free_long).free_double += (*it).weight; squared_min_dist_sum += pow((*it).free_double, 2) * (*it).weight; weight_sum += (*it).weight; } std::vector<double> weights; double sumw = 0; double prob = 0; for (wplist::iterator it = src.begin(); it != src.end(); ++it) { weighted_point p = *it; weighted_point bp = bicriteria.at(p.free_long); prob = get_probability(p, bp, weight_sum, squared_min_dist_sum); (*it).free_double = prob; weights.push_back(prob); sumw += prob; } discrete_distribution d(weights.begin(), weights.end()); std::vector<size_t> ind(dstsize); std::generate(ind.begin(), ind.end(), d); for (std::vector<size_t>::iterator it = ind.begin(); it != ind.end(); ++it) { weighted_point sample = src.at(*it); sample.weight = 1.0 / dstsize * sumw / sample.free_double * sample.weight; sample.free_double = 0; sample.free_long = 0; dst.push_back(sample); } }
/**
 * Converts a list of weighted points into a list of Eigen weighted vectors.
 *
 * Works in two passes: first every point is converted with the single-point
 * convert() overload, then each resulting vector is rebuilt as an
 * eigen_svec_t constructed with d_ and filled with the same non-zero entries.
 * The rebuild happens only after all points were converted, so every vector
 * is constructed from the value d_ has at that moment.
 *
 * @param src        points to convert
 * @param update_map forwarded unchanged to the single-point convert()
 * @return one converted element per element of src, in the same order
 */
eigen_wsvec_list_t eigen_feature_mapper::convert(
    const wplist& src,
    bool update_map) {
  eigen_wsvec_list_t converted(src.size());

  // Pass 1: element-wise conversion.
  eigen_wsvec_list_t::iterator out = converted.begin();
  for (wplist::const_iterator in = src.begin(); in != src.end(); ++in, ++out) {
    *out = convert(*in, update_map);
  }

  // Pass 2: copy the stored entries into a vector constructed with d_.
  for (out = converted.begin(); out != converted.end(); ++out) {
    eigen_svec_t rebuilt(d_);
    for (eigen_svec_t::InnerIterator entry(out->data); entry; ++entry) {
      rebuilt.coeffRef(entry.index()) = entry.value();
    }
    out->data = rebuilt;
  }

  return converted;
}
void kmeans_compressor::compress( const wplist& src, csize_t bsize, csize_t dstsize, wplist& dst) { if (dstsize >= src.size()) { concat(src, dst); return; } timer_start(); wplist bicriteria; timer_start(); get_bicriteria(src, bsize, dstsize, bicriteria); if (bicriteria.size() < dstsize) { wplist srcclone = src; timer_start(); bicriteria_to_coreset(srcclone, bicriteria, dstsize - bicriteria.size(), dst); } bicriteria_as_coreset(src, bicriteria, dstsize, dst); return; }