Esempio n. 1
0
// Tops up `dst` to `dstsize` points using re-weighted bicriteria points.
//
// `bic` is truncated (or padded) to the number of free slots in `dst`, every
// weight in it is reset to zero, and then
//   - each point already in `dst` subtracts its weight from the bicriteria
//     point that min_dist() selects for it, and
//   - each point of `src` adds its weight to the bicriteria point that
//     min_dist() selects for it.
// The re-weighted bicriteria points are appended to `dst`, and finally every
// negative weight in `dst` is clamped to zero.
//
// src:     points whose weights are distributed over the bicriteria points.
// bic:     bicriteria points; taken by value because it is resized and
//          re-weighted locally.
// dstsize: target number of points in `dst`.
// dst:     coreset under construction; new points are appended to it.
void bicriteria_as_coreset(
    const wplist& src,
    wplist bic,
    const csize_t dstsize,
    wplist& dst) {
  typedef wplist::const_iterator citer;
  typedef wplist::iterator iter;

  // Only top up when there is room left.  Without this guard the subtraction
  // below wraps around for dst.size() > dstsize (assuming the sizes are
  // unsigned), and for dst.size() == dstsize `bic` would become empty and
  // then be indexed with the result of min_dist().
  if (dst.size() < dstsize) {
    // NOTE(review): if `bic` holds fewer points than there are free slots,
    // resize() pads it with default-constructed points -- confirm callers
    // never rely on that.
    bic.resize(dstsize - dst.size());
    for (iter it = bic.begin(); it != bic.end(); ++it) {
      it->weight = 0;
    }

    // Weight already represented in dst is taken away from the selected
    // bicriteria point ...
    for (iter it = dst.begin(); it != dst.end(); ++it) {
      pair<int, double> m = min_dist(*it, bic);
      bic[m.first].weight -= it->weight;
    }
    // ... and the full source weight is added on top.
    for (citer it = src.begin(); it != src.end(); ++it) {
      pair<int, double> m = min_dist(*it, bic);
      bic[m.first].weight += it->weight;
    }

    // After the resize `bic` has exactly as many points as dst has free slots.
    std::copy(bic.begin(), bic.end(), std::back_inserter(dst));
  }

  // The subtraction above can leave negative weights; clamp them.
  for (iter it = dst.begin(); it != dst.end(); ++it) {
    if (it->weight < 0) {
      it->weight = 0;
    }
  }
}
// Repeatedly recomputes kcenters_ from `points` until no center moves by
// more than a small tolerance.
//
// Each pass:
//   1. every point is assigned to the center index returned by min_dist(),
//      and its data is accumulated into that center's slot via
//      scalar_mul_and_add() together with its weight;
//   2. each accumulated slot is scaled by 1 / (total assigned weight) via
//      scalar_dot(); a center with no assigned weight keeps its old value;
//   3. kcenters_ is replaced by the new centers.
//
// Does nothing when there are fewer points than k_.
//
// NOTE(review): the loop has no iteration cap; it relies on the centers
// settling within the tolerance.
void kmeans_clustering_method::do_batch_update(wplist& points) {
  if (points.size() < k_) {
    return;
  }

  // A center that moved by no more than this counts as converged.
  const double tolerance = 1e-9;

  bool terminated = false;
  while (!terminated) {
    vector<common::sfv_t> kcenters_new(k_);
    vector<double> center_count(k_, 0);

    // Assignment step: accumulate weighted data and total weight per center.
    for (wplist::iterator it = points.begin(); it != points.end(); ++it) {
      pair<int64_t, double> m = min_dist((*it).data, kcenters_);
      scalar_mul_and_add(it->data, it->weight, kcenters_new[m.first]);
      center_count[m.first] += it->weight;
    }

    // Update step: normalise each accumulator and check how far it moved.
    terminated = true;
    for (size_t i = 0; i < k_; ++i) {
      if (center_count[i] == 0) {
        // Nothing was assigned to this center; keep it where it was.
        kcenters_new[i] = kcenters_[i];
        continue;
      }
      kcenters_new[i] = scalar_dot(kcenters_new[i], 1.0 / center_count[i]);
      double d = dist(kcenters_new[i], kcenters_[i]);
      if (d > tolerance) {
        terminated = false;
      }
    }
    kcenters_ = kcenters_new;
  }
}
Esempio n. 3
0
// Builds a bicriteria point set `dst` from `src` by repeated weighted
// sampling.
//
// Starting with all of `src` as the residual set, each round
//   1. draws up to `bsize` indices from the residual set with probability
//      proportional to point weight (duplicates are dropped),
//   2. appends the drawn points to `dst` (with free_long reset to 0),
//   3. stores min_dist(point, dst).second in free_double of every residual
//      point, sorts the residual set with compare_weight and keeps only the
//      leading fraction `r` of it (at least one point).
// Rounds stop once the residual set has a single point left or `dst` has
// reached `dstsize` points.  `dst` is cleared first and may end up holding
// more than `dstsize` points, because a whole round is appended at once.
//
// src:     input points (not modified; a working copy is made).
// bsize:   number of indices drawn per round.
// dstsize: stop adding rounds once dst holds at least this many points.
// dst:     output; overwritten.
void kmeans_compressor::get_bicriteria(
    const wplist& src, csize_t bsize, csize_t dstsize, wplist& dst) {
  timer_start();
  dst.clear();
  wplist resid = src;
  vector<double> weights(src.size());

  // Fraction of the residual set kept after every round, floored at 0.1.
  double r = (1 - exp(bsize*(log(bsize)-log(src.size()))/dstsize)) / 2;
  r = max(0.1, r);

  std::vector<size_t> ind(bsize);
  while (resid.size() > 1 && dst.size() < dstsize) {
    timer_start();

    // Sampling weights are the point weights of the current residual set.
    weights.resize(resid.size());
    for (wplist::iterator it = resid.begin(); it != resid.end(); ++it) {
      weights[it - resid.begin()] = it->weight;
    }
    discrete_distribution d(weights.begin(), weights.end());

    // The de-duplication below shrinks `ind`; restore it so that every
    // round draws `bsize` indices instead of ever fewer.
    ind.resize(bsize);
    std::generate(ind.begin(), ind.end(), d);

    // Drop duplicate draws.
    std::sort(ind.begin(), ind.end());
    std::vector<size_t>::iterator it = std::unique(ind.begin(), ind.end());
    ind.erase(it, ind.end());

    for (it = ind.begin(); it != ind.end(); ++it) {
       weighted_point p = resid[*it];
       p.free_long = 0;
       dst.push_back(p);
    }

    // Record, for every residual point, the min_dist() value against the
    // points chosen so far.
    for (wplist::iterator itr = resid.begin(); itr != resid.end(); ++itr) {
       (*itr).free_double = min_dist(*itr, dst).second;
    }
    // NOTE(review): compare_weight is defined elsewhere; presumably it
    // orders the points so that the prefix kept below is the part that
    // still needs to be covered -- confirm.
    std::sort(resid.begin(), resid.end(), compare_weight);
    csize_t size = static_cast<csize_t>(
        std::min(static_cast<double>(resid.size()),
                 static_cast<double>(resid.size()*r)));
    if (size  == 0) {
      size = 1;
    }

    resid.resize(size);
  }
}
// Seeds kcenters_ with k_ centers taken from `points`.
//
// The first center is the data of points[0].  Every further center is the
// data of a point drawn with probability proportional to
// (min_dist(point, current centers).second * point weight).
//
// Does nothing (kcenters_ is left untouched) when `points` is empty or holds
// fewer than k_ points.
void kmeans_clustering_method::initialize_centers(wplist& points) {
  // The explicit empty() test covers k_ == 0, where the size comparison
  // alone would let an empty list through to points[0].
  if (points.empty() || points.size() < k_) {
    return;
  }

  kcenters_.clear();
  kcenters_.push_back(points[0].data);

  vector<double> weights;
  weights.reserve(points.size());
  while (kcenters_.size() < k_) {
    // Recompute the sampling weights against the centers chosen so far.
    weights.clear();
    for (wplist::iterator it = points.begin(); it != points.end(); ++it) {
      pair<int64_t, double> m = min_dist((*it).data, kcenters_);
      weights.push_back(m.second * it->weight);
    }
    discrete_distribution d(weights.begin(), weights.end());
    kcenters_.push_back(points[d()].data);
  }
}
Esempio n. 5
0
// Samples `dstsize` weighted points from `src` and appends them to `dst`.
//
// Steps:
//   1. For every source point, min_dist() against `bicriteria` is stored in
//      the point (index in free_long, value in free_double), and the point's
//      weight is added to free_double of the selected bicriteria point.
//   2. get_probability() turns each source point into a sampling weight,
//      which is also stored in the point's free_double.
//   3. `dstsize` indices are drawn according to those weights; each drawn
//      point is copied, re-weighted by
//      (1 / dstsize) * (sum of sampling weights) / (own sampling weight)
//      * (original weight), has its scratch fields cleared, and is appended
//      to `dst`.
//
// If `bicriteria` is empty, `dst` is instead overwritten with a copy of
// `src`.
//
// src:        source points; free_long / free_double are overwritten.
// bicriteria: bicriteria points; free_double is overwritten.
// dstsize:    number of samples to draw.
// dst:        output; samples are appended.
void kmeans_compressor::bicriteria_to_coreset(
    wplist& src,
    wplist& bicriteria,
    csize_t dstsize,
    wplist& dst) {
  if (bicriteria.size() == 0) {
    dst = src;
    return;
  }

  // free_double of each bicriteria point is used as an accumulator below;
  // start from zero so that whatever value it carried in does not leak into
  // the sums.
  for (wplist::iterator it = bicriteria.begin();
       it != bicriteria.end(); ++it) {
    it->free_double = 0;
  }

  double weight_sum = 0;
  double squared_min_dist_sum = 0;
  for (wplist::iterator it = src.begin(); it != src.end(); ++it) {
    const std::pair<int, double> m = min_dist(*it, bicriteria);
    it->free_long = m.first;
    it->free_double = m.second;
    bicriteria.at(it->free_long).free_double += it->weight;
    const double md = it->free_double;
    squared_min_dist_sum += md * md * it->weight;
    weight_sum += it->weight;
  }

  std::vector<double> weights;
  weights.reserve(src.size());
  double sumw = 0;
  for (wplist::iterator it = src.begin(); it != src.end(); ++it) {
    // Work on the stored points directly rather than copying both of them
    // for every source point.
    weighted_point& bp = bicriteria.at(it->free_long);
    const double prob =
        get_probability(*it, bp, weight_sum, squared_min_dist_sum);
    // From here on free_double holds the sampling weight of the point.
    it->free_double = prob;
    weights.push_back(prob);
    sumw += prob;
  }

  discrete_distribution d(weights.begin(), weights.end());

  std::vector<size_t> ind(dstsize);
  std::generate(ind.begin(), ind.end(), d);

  for (std::vector<size_t>::iterator it = ind.begin(); it != ind.end(); ++it) {
    weighted_point sample = src.at(*it);
    sample.weight = 1.0 / dstsize * sumw / sample.free_double * sample.weight;
    // Scratch fields are not part of the result.
    sample.free_double = 0;
    sample.free_long = 0;
    dst.push_back(sample);
  }
}
// Converts a list of weighted points into a list of Eigen weighted sparse
// vectors, all sized d_.
//
// src:        points to convert.
// update_map: forwarded unchanged to the single-point convert() overload.
// Returns one converted entry per element of `src`, in the same order.
eigen_wsvec_list_t eigen_feature_mapper::convert(
    const wplist& src,
    bool update_map) {
  eigen_wsvec_list_t converted(src.size());

  // Pass 1: convert point by point with the single-point overload.
  eigen_wsvec_list_t::iterator out = converted.begin();
  for (wplist::const_iterator in = src.begin(); in != src.end(); ++in, ++out) {
    *out = convert(*in, update_map);
  }

  // Pass 2: rebuild every vector with size d_, copying the stored entries.
  // This runs only after all points are converted (presumably because d_
  // can change while converting -- confirm against the single-point
  // overload).
  for (out = converted.begin(); out != converted.end(); ++out) {
    eigen_svec_t resized(d_);
    eigen_svec_t::InnerIterator entry(out->data);
    while (entry) {
      resized.coeffRef(entry.index()) = entry.value();
      ++entry;
    }
    out->data = resized;
  }

  return converted;
}
Esempio n. 7
0
void kmeans_compressor::compress(
    const wplist& src,
    csize_t bsize,
    csize_t dstsize,
    wplist& dst) {

  if (dstsize >= src.size()) {
    concat(src, dst);
    return;
  }
  timer_start();
  wplist bicriteria;
  timer_start();
  get_bicriteria(src, bsize, dstsize, bicriteria);
  if (bicriteria.size() < dstsize) {
    wplist srcclone = src;
    timer_start();
    bicriteria_to_coreset(srcclone, bicriteria,
                          dstsize - bicriteria.size(), dst);
  }
  bicriteria_as_coreset(src, bicriteria, dstsize, dst);

  return;
}