void bicriteria_as_coreset( const wplist& src, wplist bic, const csize_t dstsize, wplist& dst) { typedef wplist::const_iterator citer; typedef wplist::iterator iter; bic.resize(dstsize - dst.size()); for (iter it = bic.begin(); it != bic.end(); ++it) { it->weight = 0; } for (iter it = dst.begin(); it != dst.end(); ++it) { pair<int, double> m = min_dist(*it, bic); bic[m.first].weight -= it->weight; } for (citer it = src.begin(); it != src.end(); ++it) { pair<int, double> m = min_dist(*it, bic); bic[m.first].weight += it->weight; } std::copy(bic.begin(), bic.begin()+dstsize-dst.size(), std::back_inserter(dst)); for (iter it = dst.begin(); it != dst.end(); ++it) { if (it->weight < 0) { it->weight = 0; } } }
// Decays the weight of every point in `points` in place.
//
// Each weight is multiplied by exp(-config_.forgetting_factor); the
// multiplier is computed once before the loop.
void compressive_storage::forget_weight(wplist& points) {
  const double decay = std::exp(-config_.forgetting_factor);
  const size_t count = points.size();
  for (size_t i = 0; i < count; ++i) {
    points[i].weight *= decay;
  }
}
// Iteratively refines kcenters_ against `points` until the centers stop
// moving.
//
// Each round:
//   1. every point is assigned to its nearest current center (min_dist),
//      and its data, scaled by its weight, is accumulated for that center;
//   2. each center that received weight is replaced by the accumulated
//      vector scaled by 1 / (received weight); a center that received no
//      weight keeps its previous position;
//   3. the loop ends when no recomputed center moved by more than 1e-9.
//
// Does nothing when there are fewer points than k_.
//
// NOTE(review): there is no iteration cap; termination relies on the
// centers settling within the 1e-9 tolerance.
void kmeans_clustering_method::do_batch_update(wplist& points) {
  if (points.size() < k_) {
    return;
  }

  bool terminated = false;
  while (!terminated) {
    vector<common::sfv_t> kcenters_new(k_);
    vector<double> center_count(k_, 0);

    // Assignment step: accumulate weighted data per nearest center.
    for (wplist::iterator it = points.begin(); it != points.end(); ++it) {
      pair<int64_t, double> m = min_dist((*it).data, kcenters_);
      // presumably adds (weight * data) into kcenters_new[m.first] --
      // verify against scalar_mul_and_add.
      scalar_mul_and_add(it->data, it->weight, kcenters_new[m.first]);
      center_count[m.first] += it->weight;
    }

    // Update step: normalize the accumulated sums and test for movement.
    terminated = true;
    for (size_t i = 0; i < k_; ++i) {
      if (center_count[i] == 0) {
        // Nothing was assigned to this center; keep it where it was.
        kcenters_new[i] = kcenters_[i];
        continue;
      }
      kcenters_new[i] = scalar_dot(kcenters_new[i], 1.0 / center_count[i]);
      double d = dist(kcenters_new[i], kcenters_[i]);
      if (d > 1e-9) {
        terminated = false;
      }
    }
    kcenters_ = kcenters_new;
  }
}
// Decays the weight of every point in `points` in place.
//
// Each weight is multiplied by exp(-lam), where lam is the configured
// forgetting factor. The decay rate is read from config_.forgetting_factor
// rather than config_.forgetting_threshold.
// NOTE(review): presumably the threshold is a cut-off to compare decayed
// weights against, not a rate -- confirm against the config definition.
void compressive_storage::forget_weight(wplist& points) {
  const double lam = config_.forgetting_factor;
  // The multiplier does not depend on the point; compute it once.
  const double factor = exp(-lam);
  typedef wplist::iterator iter;
  for (iter it = points.begin(); it != points.end(); ++it) {
    it->weight *= factor;
  }
}
// Partitions `points` into k_ lists according to the nearest center.
//
// Returns a vector of k_ point lists; list i holds copies of the points
// whose nearest entry in kcenters_ (as reported by min_dist) has index i.
// Lists for centers that attract no point are left empty.
//
// Throws not_performed (via JUBATUS_EXCEPTION) when no centers have been
// computed yet, instead of returning a meaningless partition.
vector<wplist> kmeans_clustering_method::get_clusters(
    const wplist& points) const {
  if (kcenters_.empty()) {
    throw JUBATUS_EXCEPTION(not_performed());
  }

  vector<wplist> ret(k_);
  for (wplist::const_iterator it = points.begin(); it != points.end(); ++it) {
    pair<int64_t, double> m = min_dist(it->data, kcenters_);
    ret[m.first].push_back(*it);
  }
  return ret;
}
void kmeans_compressor::bicriteria_to_coreset( wplist& src, wplist& bicriteria, csize_t dstsize, wplist& dst) { if (bicriteria.size() == 0) { dst = src; return; } double weight_sum = 0; double squared_min_dist_sum = 0; for (wplist::iterator it = src.begin(); it != src.end(); ++it) { std::pair<int, double> m = min_dist(*it, bicriteria); (*it).free_long = m.first; (*it).free_double = m.second; bicriteria.at((*it).free_long).free_double += (*it).weight; squared_min_dist_sum += pow((*it).free_double, 2) * (*it).weight; weight_sum += (*it).weight; } std::vector<double> weights; double sumw = 0; double prob = 0; for (wplist::iterator it = src.begin(); it != src.end(); ++it) { weighted_point p = *it; weighted_point bp = bicriteria.at(p.free_long); prob = get_probability(p, bp, weight_sum, squared_min_dist_sum); (*it).free_double = prob; weights.push_back(prob); sumw += prob; } discrete_distribution d(weights.begin(), weights.end()); std::vector<size_t> ind(dstsize); std::generate(ind.begin(), ind.end(), d); for (std::vector<size_t>::iterator it = ind.begin(); it != ind.end(); ++it) { weighted_point sample = src.at(*it); sample.weight = 1.0 / dstsize * sumw / sample.free_double * sample.weight; sample.free_double = 0; sample.free_long = 0; dst.push_back(sample); } }
std::pair<size_t, double> min_dist(const weighted_point& d1, const wplist& P) { double md = DBL_MAX; size_t midx = 0; for (wplist::const_iterator it = P.begin(); it != P.end(); ++it) { double d = dist((*it), d1); if (md > d) { midx = it - P.begin(); md = d; } } return std::make_pair(midx, md); }
// Chooses the initial contents of kcenters_ from `points`.
//
// With fewer points than k_, every point's data becomes a center and the
// function returns. Otherwise the first point seeds the center list and
// further centers are drawn one at a time: each point gets the sampling
// weight (distance to its nearest chosen center) * (point weight), and the
// next center is the point picked by a discrete_distribution over those
// weights, until k_ centers exist.
void kmeans_clustering_method::initialize_centers(wplist& points) {
  kcenters_.clear();

  if (points.size() < k_) {
    // Too few points to choose from: use all of them.
    for (size_t i = 0; i < points.size(); ++i) {
      kcenters_.push_back(points[i].data);
    }
    return;
  }

  kcenters_.push_back(points[0].data);

  vector<double> weights;
  while (kcenters_.size() < k_) {
    // Recompute the sampling weights against the current center set.
    weights.clear();
    for (wplist::iterator p = points.begin(); p != points.end(); ++p) {
      const pair<int64_t, double> nearest = min_dist(p->data, kcenters_);
      weights.push_back(nearest.second * p->weight);
    }
    discrete_distribution sampler(weights.begin(), weights.end());
    kcenters_.push_back(points[sampler()].data);
  }
}
// Groups `points` by their nearest center.
//
// Returns k_ point lists; list i receives a copy of every point for which
// min_dist() reports center index i. Throws not_performed (wrapped by
// JUBATUS_EXCEPTION) when kcenters_ is empty.
vector<wplist> kmeans_clustering_method::get_clusters(
    const wplist& points) const {
  if (kcenters_.empty()) {
    throw JUBATUS_EXCEPTION(not_performed());
  }

  typedef wplist::const_iterator point_iter;

  vector<wplist> clusters(k_);
  const point_iter last = points.end();
  for (point_iter p = points.begin(); p != last; ++p) {
    const pair<int64_t, double> nearest = min_dist(p->data, kcenters_);
    clusters[nearest.first].push_back(*p);
  }
  return clusters;
}
// Converts a list of weighted points into a list of Eigen-backed vectors.
//
// Pass 1 converts every element with the single-point convert() overload,
// forwarding update_map. Pass 2 rebuilds each converted vector as an
// eigen_svec_t constructed with d_, copying every stored coefficient, so
// that all returned vectors are built with the same d_.
// NOTE(review): presumably d_ can grow during pass 1 when update_map is
// true, which is why the resize is a separate pass -- confirm.
eigen_wsvec_list_t eigen_feature_mapper::convert(
    const wplist& src,
    bool update_map) {
  eigen_wsvec_list_t converted(src.size());

  eigen_wsvec_list_t::iterator out = converted.begin();
  for (wplist::const_iterator in = src.begin();
       in != src.end(); ++in, ++out) {
    *out = convert(*in, update_map);
  }

  for (out = converted.begin(); out != converted.end(); ++out) {
    eigen_svec_t resized(d_);
    for (eigen_svec_t::InnerIterator nz(out->data); nz; ++nz) {
      resized.coeffRef(nz.index()) = nz.value();
    }
    out->data = resized;
  }

  return converted;
}
void concat(const wplist& src, wplist& dst) { dst.insert(dst.end(), src.begin(), src.end()); }