// Fills `ret` from the rows that similar_row() reports for `query`.
//
// Up to complete_row_similar_num_ neighbours are requested. Each neighbour
// whose row in orig_ is non-empty contributes its entries, scaled by the
// float that similar_row() paired with the neighbour id. The collected
// entries are passed through common::sort_and_merge() and every value is then
// divided by the number of non-empty rows. `ret` stays empty when there are
// no neighbours or all of their rows are empty.
void recommender_base::complete_row(const common::sfv_t& query,
                                    common::sfv_t& ret) const {
  ret.clear();

  vector<pair<string, float> > neighbors;
  similar_row(query, neighbors, complete_row_similar_num_);
  if (neighbors.empty()) {
    return;
  }

  size_t used_rows = 0;
  for (vector<pair<string, float> >::const_iterator nb = neighbors.begin();
       nb != neighbors.end(); ++nb) {
    common::sfv_t stored;
    orig_.get_row(nb->first, stored);
    if (stored.empty()) {
      // Empty rows neither contribute entries nor count towards the divisor.
      continue;
    }
    ++used_rows;

    const float weight = nb->second;
    for (common::sfv_t::const_iterator e = stored.begin();
         e != stored.end(); ++e) {
      ret.push_back(make_pair(e->first, e->second * weight));
    }
  }

  if (used_rows == 0) {
    return;
  }

  common::sort_and_merge(ret);
  for (common::sfv_t::iterator e = ret.begin(); e != ret.end(); ++e) {
    e->second /= used_rows;
  }
}
// Creates an eigen_svec_t constructed with d_ and feeds every element of
// `src` into it through insertc(), in order.
eigen_svec_t eigen_feature_mapper::convertc(const common::sfv_t& src) const {
  eigen_svec_t converted(d_);
  for (size_t i = 0; i < src.size(); ++i) {
    insertc(src[i], converted);
  }
  return converted;
}
// Updates the stored (v1, v2) pairs of `pos_label` and `neg_label` for every
// feature in `sfv`.
//
// For a feature with value x:
//   pos.v1 <- pos.v1 + step_width * pos.v2 * x
//   neg.v1 <- neg.v1 - step_width * neg.v2 * x
//   v2     <- 1 / (1 / v2 + 2 * step_width * x * x * C)
// where C is config_.regularization_weight. Both pairs are initialised to
// (0, 1) before ClassifierUtil::get_two() is called; presumably get_two()
// overwrites them with the stored values when present -- TODO confirm.
// The negative side is skipped when `neg_label` is the empty string.
// The storage write lock is held for the whole call and touch(pos_label) is
// invoked once after all features have been processed.
void confidence_weighted::update(
    const common::sfv_t& sfv,
    float step_width,
    const string& pos_label,
    const string& neg_label) {
  util::concurrent::scoped_wlock lk(storage_->get_lock());

  const float C = config_.regularization_weight;
  const bool has_negative = (neg_label != "");

  for (size_t i = 0; i < sfv.size(); ++i) {
    const string& feature = sfv[i].first;
    const float x = sfv[i].second;

    storage::feature_val2_t stored;
    storage_->get2_nolock(feature, stored);

    storage::val2_t pos(0.f, 1.f);
    storage::val2_t neg(0.f, 1.f);
    ClassifierUtil::get_two(stored, pos_label, neg_label, pos, neg);

    // The same increment is applied to the v2 term of both labels.
    const float covar_step = 2.f * step_width * x * x * C;

    storage_->set2_nolock(
        feature,
        pos_label,
        storage::val2_t(pos.v1 + step_width * pos.v2 * x,
                        1.f / (1.f / pos.v2 + covar_step)));

    if (has_negative) {
      storage_->set2_nolock(
          feature,
          neg_label,
          storage::val2_t(neg.v1 - step_width * neg.v2 * x,
                          1.f / (1.f / neg.v2 + covar_step)));
    }
  }

  touch(pos_label);
}
// Returns a new vector holding every (key, value) of `p` with the value
// multiplied by `s`. The product is formed in double before being stored.
common::sfv_t scalar_dot(const common::sfv_t& p, double s) {
  common::sfv_t scaled;
  for (size_t i = 0; i < p.size(); ++i) {
    scaled.push_back(make_pair(p[i].first, p[i].second * s));
  }
  return scaled;
}
// Returns the sum of the squared values of `p` (0 for an empty vector).
double sum2(const common::sfv_t& p) {
  double total = 0;
  for (common::sfv_t::const_iterator it = p.begin(); it != p.end(); ++it) {
    // Square by plain multiplication in double instead of calling the
    // general-purpose std::pow() for an exponent of 2.
    const double v = it->second;
    total += v * v;
  }
  return total;
}
// Computes, for each class id, the sum over the features of `sfv` of
// (stored v1 for that feature and class) * (feature value), and writes the
// results into `ret` keyed by class2id_.get_key(id).
//
// Features missing from tbl_ are ignored. Classes whose accumulated sum is
// exactly 0 are not written to `ret`.
void local_storage::inp(const common::sfv_t& sfv,
                        map_feature_val1_t& ret) const {
  ret.clear();

  // One accumulator slot per known class id, value-initialised to 0.
  std::vector<float> scores(class2id_.size());

  for (size_t i = 0; i < sfv.size(); ++i) {
    id_features3_t::const_iterator row = tbl_.find(sfv[i].first);
    if (row == tbl_.end()) {
      continue;
    }

    const float val = sfv[i].second;
    const id_feature_val3_t& per_class = row->second;
    for (id_feature_val3_t::const_iterator c = per_class.begin();
         c != per_class.end(); ++c) {
      scores[c->first] += c->second.v1 * val;
    }
  }

  for (size_t id = 0; id < scores.size(); ++id) {
    if (scores[id] != 0.f) {
      ret[class2id_.get_key(id)] = scores[id];
    }
  }
}
// Updates the stored (v1, v2) pairs of `pos_label` and `neg_label` for every
// feature in `sfv`, using the storage returned by get_storage().
//
// For a feature with value x:
//   pos.v1 <- pos.v1 + alpha * pos.v2 * x
//   neg.v1 <- neg.v1 - alpha * neg.v2 * x
//   v2     <- v2 - beta * v2 * v2 * x * x      (for each label separately)
// Both pairs are initialised to (0, 1) before ClassifierUtil::get_two() is
// called; presumably get_two() replaces them with stored values when they
// exist -- TODO confirm. Nothing is written for the negative label when
// `neg_label` is the empty string.
void arow::update(
    const common::sfv_t& sfv,
    float alpha,
    float beta,
    const std::string& pos_label,
    const std::string& neg_label) {
  storage::storage_base* sto = get_storage();
  const bool has_negative = (neg_label != "");

  for (size_t i = 0; i < sfv.size(); ++i) {
    const string& feature = sfv[i].first;
    const float x = sfv[i].second;

    storage::feature_val2_t stored;
    sto->get2(feature, stored);

    storage::val2_t pos(0.f, 1.f);
    storage::val2_t neg(0.f, 1.f);
    ClassifierUtil::get_two(stored, pos_label, neg_label, pos, neg);

    sto->set2(
        feature,
        pos_label,
        storage::val2_t(pos.v1 + alpha * pos.v2 * x,
                        pos.v2 - beta * pos.v2 * pos.v2 * x * x));

    if (has_negative) {
      sto->set2(
          feature,
          neg_label,
          storage::val2_t(neg.v1 - alpha * neg.v2 * x,
                          neg.v2 - beta * neg.v2 * neg.v2 * x * x));
    }
  }
}
// Computes, for each class label, the sum over the features of `sfv` of
// (stored v1 for that feature and class) * (feature value), and writes one
// entry per label returned by class2id_.get_all_id2key() into `ret`.
//
// A read lock on mutex_ is held for the duration of the call. Features
// missing from tbl_ are ignored. Labels whose id is NOTFOUND, or for which
// nothing was accumulated, are written with the value 0.0.
void local_storage::inp(const common::sfv_t& sfv,
                        map_feature_val1_t& ret) const {
  ret.clear();
  scoped_rlock lk(mutex_);

  // Accumulate under the numeric class id (uint64_t) rather than the label
  // string, because hashing strings is slow.
  jubatus::util::data::unordered_map<uint64_t, double> scores;
  for (size_t i = 0; i < sfv.size(); ++i) {
    id_features3_t::const_iterator row = tbl_.find(sfv[i].first);
    if (row == tbl_.end()) {
      continue;
    }

    const double val = sfv[i].second;
    const id_feature_val3_t& per_class = row->second;
    for (id_feature_val3_t::const_iterator c = per_class.begin();
         c != per_class.end(); ++c) {
      scores[c->first] += c->second.v1 * val;
    }
  }

  const std::vector<std::string> labels = class2id_.get_all_id2key();
  for (std::vector<std::string>::const_iterator label = labels.begin();
       label != labels.end(); ++label) {
    const uint64_t id = class2id_.get_id_const(*label);

    double score = 0.0;
    if (id != common::key_manager::NOTFOUND) {
      jubatus::util::data::unordered_map<uint64_t, double>::const_iterator
          hit = scores.find(id);
      if (hit != scores.end()) {
        score = hit->second;
      }
    }
    ret[*label] = score;
  }
}
// Clears `ret`, then for every feature in `sfv` fetches its entries with
// get() and adds (entry value * feature value) to `ret` under the entry's key.
void storage_base::inp(const common::sfv_t& sfv,
                       map_feature_val1_t& ret) const {
  ret.clear();

  for (size_t i = 0; i < sfv.size(); ++i) {
    const float val = sfv[i].second;

    feature_val1_t weights;
    get(sfv[i].first, weights);

    for (feature_val1_t::const_iterator w = weights.begin();
         w != weights.end(); ++w) {
      ret[w->first] += w->second * val;
    }
  }
}
void recommender_base::complete_row(const std::string& id, common::sfv_t& ret) const { ret.clear(); common::sfv_t sfv; orig_.get_row(id, sfv); complete_row(sfv, ret); }
// Returns the square root of the sum of squared values in `query`
// (0 for an empty vector). The sum is accumulated in float.
float recommender_base::calc_l2norm(const common::sfv_t& query) {
  float sum_sq = 0.f;
  for (common::sfv_t::const_iterator it = query.begin();
       it != query.end(); ++it) {
    sum_sq += it->second * it->second;
  }
  return sqrt(sum_sq);
}
// Returns the square root of the sum of squared values in `sfv`
// (0 for an empty vector). The sum is accumulated in float.
float inverted_index_storage::calc_l2norm(const common::sfv_t& sfv) {
  float sum_sq = 0.f;
  common::sfv_t::const_iterator it = sfv.begin();
  while (it != sfv.end()) {
    sum_sq += it->second * it->second;
    ++it;
  }
  return std::sqrt(sum_sq);
}
// Appends to `scores` at most `ret_num` (column key, score) pairs for
// `query`, highest score first.
//
// Raw per-column scores are gathered by calling add_inp_scores() for every
// query element. Each raw score is then divided by the column norm from
// calc_columnl2norm() and by the query's L2 norm; columns with a zero norm
// get a score of 0. Nothing is appended when the query norm is 0.
// `scores` is not cleared first, so existing content is kept.
void inverted_index_storage::calc_scores(
    const common::sfv_t& query,
    vector<pair<string, float> >& scores,
    size_t ret_num) const {
  const float query_norm = calc_l2norm(query);
  if (query_norm == 0.f) {
    return;
  }

  jubatus::util::data::unordered_map<uint64_t, float> raw_scores;
  for (common::sfv_t::const_iterator q = query.begin();
       q != query.end(); ++q) {
    const string& fid = q->first;
    float val = q->second;
    add_inp_scores(fid, val, raw_scores);
  }

  vector<pair<float, uint64_t> > ranked;
  for (jubatus::util::data::unordered_map<uint64_t, float>::const_iterator
           it = raw_scores.begin();
       it != raw_scores.end(); ++it) {
    const float column_norm = calc_columnl2norm(it->first);
    float normalized = 0.f;
    if (column_norm != 0.f) {
      normalized = it->second / column_norm / query_norm;
    }
    ranked.push_back(make_pair(normalized, it->first));
  }

  // Sorting through reverse iterators leaves `ranked` in descending order.
  sort(ranked.rbegin(), ranked.rend());

  const size_t limit = ranked.size() < ret_num ? ranked.size() : ret_num;
  for (size_t i = 0; i < limit; ++i) {
    scores.push_back(
        make_pair(column2id_.get_key(ranked[i].second), ranked[i].first));
  }
}
// Returns the sum of squared values in `fv` (no square root is taken).
static float squared_norm(const common::sfv_t& fv) {
  float total = 0.f;
  for (common::sfv_t::const_iterator it = fv.begin(); it != fv.end(); ++it) {
    total += it->second * it->second;
  }
  return total;
}
// Appends one feature to `ret_fv` whose name is "<key>$<value>" (value
// formatted with the default stream formatting) and whose weight is 1.
void add_feature(const std::string& key,
                 double value,
                 common::sfv_t& ret_fv) const {
  std::stringstream name;
  name << key << "$" << value;

  const float weight = static_cast<float>(1.0);
  ret_fv.push_back(std::make_pair(name.str(), weight));
}
// Returns the sum of squared values in `fv` (no square root is taken).
float classifier_base::squared_norm(const common::sfv_t& fv) {
  float total = 0.f;
  common::sfv_t::const_iterator it = fv.begin();
  while (it != fv.end()) {
    total += it->second * it->second;
    ++it;
  }
  return total;
}
// Returns the margin from calc_margin() (which also sets `incorrect_label`)
// and writes a variance term into `var`.
//
// `var` is the sum over the features of `sfv` of
//   (covar(label) + covar(incorrect_label)) * value * value
// where each covar is the v2 stored for that feature and label, or 1 when
// storage has no entry for it.
float classifier_base::calc_margin_and_variance(
    const common::sfv_t& sfv,
    const string& label,
    string& incorrect_label,
    float& var) const {
  const float margin = calc_margin(sfv, label, incorrect_label);
  var = 0.f;

  for (common::sfv_t::const_iterator it = sfv.begin();
       it != sfv.end(); ++it) {
    const string& feature = it->first;
    const float x = it->second;

    feature_val2_t stored;
    storage_->get2(feature, stored);

    float covar_correct = 1.f;
    float covar_incorrect = 1.f;
    for (size_t k = 0; k < stored.size(); ++k) {
      // `label` is checked first, so it wins if both labels are identical.
      if (stored[k].first == label) {
        covar_correct = stored[k].second.v2;
      } else if (stored[k].first == incorrect_label) {
        covar_incorrect = stored[k].second.v2;
      }
    }

    var += (covar_correct + covar_incorrect) * x * x;
  }

  return margin;
}
// Appends (key, log(max(1.0, value))) to `ret_fv`; values below 1 are
// clamped to 1 first, so the stored weight is never negative.
void add_feature(const std::string& key,
                 double value,
                 common::sfv_t& ret_fv) const {
  const double clamped = std::max(1.0, value);
  const float weight = static_cast<float>(std::log(clamped));
  ret_fv.push_back(std::make_pair(key, weight));
}
// Appends (rmap_[item.first], item.second) to `dst` when `item.first` is a
// key of rmap_; does nothing otherwise.
void eigen_feature_mapper::rinsert(
    const pair<int, float>& item,
    common::sfv_t& dst) const {
  if (rmap_.find(item.first) == rmap_.end()) {
    return;
  }
  dst.push_back(make_pair(rmap_.find(item.first)->second, item.second));
}
// Updates the stored (v1, v2) pairs of `pos_label` and `neg_label` for every
// feature in `sfv`, using the storage returned by get_storage().
//
// With C = config_.C and x the feature value, for each label separately:
//   v1 <- v1 +/- (1 - margin) * (x * v2) / ((x * v2) * x + 1 / C)
//         ('+' for pos_label, '-' for neg_label)
//   v2 <- 1 / (1 / v2 + (2 * C + C * C * variance) * x * x)
// Both pairs are initialised to (0, 1) before ClassifierUtil::get_two() is
// called; presumably get_two() replaces them with stored values when they
// exist -- TODO confirm. Nothing is written for the negative label when
// `neg_label` is the empty string.
void normal_herd::update(
    const common::sfv_t& sfv,
    float margin,
    float variance,
    const string& pos_label,
    const string& neg_label) {
  storage::storage_base* sto = get_storage();
  const float C = config_.C;
  const bool has_negative = (neg_label != "");

  for (size_t i = 0; i < sfv.size(); ++i) {
    const string& feature = sfv[i].first;
    const float x = sfv[i].second;

    storage::feature_val2_t stored;
    sto->get2(feature, stored);

    storage::val2_t pos(0.f, 1.f);
    storage::val2_t neg(0.f, 1.f);
    ClassifierUtil::get_two(stored, pos_label, neg_label, pos, neg);

    const float x_covar_pos = x * pos.v2;
    const float x_covar_neg = x * neg.v2;
    // Term added to 1 / v2 for both labels.
    const float covar_gain = (2 * C + C * C * variance) * x * x;

    sto->set2(
        feature,
        pos_label,
        storage::val2_t(
            pos.v1
                + (1.f - margin) * x_covar_pos / (x_covar_pos * x + 1.f / C),
            1.f / ((1.f / pos.v2) + covar_gain)));

    if (has_negative) {
      sto->set2(
          feature,
          neg_label,
          storage::val2_t(
              neg.v1
                  - (1.f - margin) * x_covar_neg
                      / (x_covar_neg * x + 1.f / C),
              1.f / ((1.f / neg.v2) + covar_gain)));
    }
  }
}
// In-place `right += s * left`, walking both vectors in parallel by key.
//
// Keys only in `left` are inserted into `right` (scaled by `s`) at the
// current position; keys in both have s * left added to the right value;
// keys only in `right` are left untouched. The walk compares keys pairwise
// in sequence, so it presumably expects both inputs sorted by key -- TODO
// confirm with callers.
void scalar_mul_and_add(
    const common::sfv_t& left,
    float s,
    common::sfv_t& right) {
  common::sfv_t::const_iterator src = left.begin();
  common::sfv_t::iterator dst = right.begin();

  while (src != left.end() && dst != right.end()) {
    const int order = src->first.compare(dst->first);

    if (order > 0) {
      // Key exists only in `right`: skip it.
      ++dst;
      continue;
    }

    if (order < 0) {
      // Key exists only in `left`: insert the scaled entry. `dst` is left
      // pointing at the new element; the next comparison steps past it.
      dst = right.insert(
          dst, std::pair<std::string, float>(src->first, src->second * s));
    } else {
      dst->second += src->second * s;
      ++dst;
    }
    ++src;
  }

  // Whatever remains in `left` goes after everything already in `right`.
  while (src != left.end()) {
    right.push_back(
        std::pair<std::string, float>(src->first, src->second * s));
    ++src;
  }
}
// Returns a vector of `hash_num` floats. For every feature of `sfv` an
// mtrand generator is seeded with calc_string_hash() of the feature name,
// and (feature value * next_gaussian()) is added to each of the `hash_num`
// slots in turn.
vector<float> random_projection(const common::sfv_t& sfv, uint32_t hash_num) {
  vector<float> projected(hash_num);

  for (common::sfv_t::const_iterator it = sfv.begin();
       it != sfv.end(); ++it) {
    const uint32_t seed = common::hash_util::calc_string_hash(it->first);
    jubatus::util::math::random::mtrand gaussian_source(seed);

    const float weight = it->second;
    for (uint32_t dim = 0; dim < hash_num; ++dim) {
      projected[dim] += weight * gaussian_source.next_gaussian();
    }
  }

  return projected;
}
// For every element of `fv`, tries revert_num_value() first and, if that
// fails, revert_string_value(). A successful result is appended to
// data.num_values_ or data.string_values_ respectively; elements for which
// both fail are dropped.
void revert_feature(const common::sfv_t& fv, fv_converter::datum& data) {
  for (common::sfv_t::const_iterator it = fv.begin(); it != fv.end(); ++it) {
    std::pair<std::string, double> num_value;
    if (revert_num_value(*it, num_value)) {
      data.num_values_.push_back(num_value);
      continue;
    }

    std::pair<std::string, std::string> string_value;
    if (revert_string_value(*it, string_value)) {
      data.string_values_.push_back(string_value);
    }
  }
}
// Returns the key-wise sum of `p1` and `p2`, produced by a two-pointer merge.
//
// Entries whose key appears in only one input are copied unchanged; entries
// with equal keys are emitted once with the two values added. The merge
// compares keys pairwise in sequence, so it presumably expects both inputs
// sorted by key -- TODO confirm with callers.
common::sfv_t add(const common::sfv_t& p1, const common::sfv_t& p2) {
  common::sfv_t merged;
  common::sfv_t::const_iterator a = p1.begin();
  common::sfv_t::const_iterator b = p2.begin();

  while (a != p1.end() && b != p2.end()) {
    const int order = a->first.compare(b->first);
    if (order < 0) {
      merged.push_back(*a);
      ++a;
    } else if (order > 0) {
      merged.push_back(*b);
      ++b;
    } else {
      merged.push_back(make_pair(a->first, a->second + b->second));
      ++a;
      ++b;
    }
  }

  // At most one of these ranges is non-empty.
  merged.insert(merged.end(), a, p1.end());
  merged.insert(merged.end(), b, p2.end());
  return merged;
}
// Returns the Euclidean distance between `p1` and `p2`, treating a key that
// is absent from one vector as having the value 0 there.
//
// The vectors are walked with a two-pointer merge that compares keys
// pairwise in sequence, so it presumably expects both inputs sorted by key --
// TODO confirm with callers.
//
// Keys are compared with std::string::compare() rather than strcmp() on
// c_str(), so keys with embedded NUL characters are ordered the same way as
// by std::string's operator<. All squares are formed by multiplication in
// double, replacing the previous mix of float products and std::pow().
double dist(const common::sfv_t& p1, const common::sfv_t& p2) {
  double sum_sq = 0;
  common::sfv_t::const_iterator it1 = p1.begin();
  common::sfv_t::const_iterator it2 = p2.begin();

  while (it1 != p1.end() && it2 != p2.end()) {
    const int cmp = it1->first.compare(it2->first);
    if (cmp < 0) {
      const double v = it1->second;
      sum_sq += v * v;
      ++it1;
    } else if (cmp > 0) {
      const double v = it2->second;
      sum_sq += v * v;
      ++it2;
    } else {
      const double diff = static_cast<double>(it1->second) - it2->second;
      sum_sq += diff * diff;
      ++it1;
      ++it2;
    }
  }

  // Only one of the two tails can be non-empty.
  for (; it1 != p1.end(); ++it1) {
    const double v = it1->second;
    sum_sq += v * v;
  }
  for (; it2 != p2.end(); ++it2) {
    const double v = it2->second;
    sum_sq += v * v;
  }

  return std::sqrt(sum_sq);
}
// Returns dot(q1, q2) / (|q1| * |q2|), or 0 when either norm is 0.
//
// Side effect: unless the function returns early because of a zero norm,
// both `q1` and `q2` are sorted in place. The norms are computed before
// sorting.
float recommender_base::calc_similality(common::sfv_t& q1, common::sfv_t& q2) {
  const float norm1 = calc_l2norm(q1);
  const float norm2 = calc_l2norm(q2);
  if (norm1 == 0.f || norm2 == 0.f) {
    return 0.f;
  }

  sort(q1.begin(), q1.end());
  sort(q2.begin(), q2.end());

  // Two-pointer walk over the sorted vectors; only shared keys contribute.
  common::sfv_t::const_iterator a = q1.begin();
  common::sfv_t::const_iterator b = q2.begin();
  float dot = 0.f;
  while (a != q1.end() && b != q2.end()) {
    const int order = a->first.compare(b->first);
    if (order < 0) {
      ++a;
    } else if (order > 0) {
      ++b;
    } else {
      dot += a->second * b->second;
      ++a;
      ++b;
    }
  }

  return dot / norm1 / norm2;
}
// Multiplies every value in `fv` by get_global_weight() of its key (the
// product is formed in double and stored back as float), then erases the
// entries for which the is_zero predicate holds.
void weight_manager::get_weight(common::sfv_t& fv) const {
  for (size_t i = 0; i < fv.size(); ++i) {
    const double global_weight = get_global_weight(fv[i].first);
    fv[i].second = static_cast<float>(fv[i].second * global_weight);
  }

  fv.erase(remove_if(fv.begin(), fv.end(), is_zero()), fv.end());
}
// Updates the (v1, v2) pair stored under label "+" for every feature in
// `sfv`, holding the storage write lock for the whole call.
//
// The current pair is the first entry returned by get2_nolock(), or (0, 1)
// when nothing is stored. With x the feature value:
//   v1 <- v1 + alpha * v2 * x
//   v2 <- v2 - beta * v2 * v2 * x * x
void arow::update(
    const common::sfv_t& sfv,
    double alpha,
    double beta) {
  util::concurrent::scoped_wlock lk(storage_->get_lock());

  for (size_t i = 0; i < sfv.size(); ++i) {
    const std::string& feature = sfv[i].first;
    const double x = sfv[i].second;

    storage::feature_val2_t stored;
    storage_->get2_nolock(feature, stored);

    storage::val2_t current(0.0, 1.0);
    if (stored.size() != 0) {
      current = stored[0].second;
    }

    storage_->set2_nolock(
        feature,
        "+",
        storage::val2_t(
            current.v1 + alpha * current.v2 * x,
            current.v2 - beta * current.v2 * current.v2 * x * x));
  }
}
// Returns a vector with all_lsh_num() entries (as reported by the model).
//
// For every feature of `query`, the projection returned by get_projection()
// for the hash of the feature name is scaled by the feature value and added
// component-wise. Every component is finally divided by bin_width_.
vector<float> euclid_lsh::calculate_lsh(const common::sfv_t& query) const {
  vector<float> hash(mixable_storage_->get_model()->all_lsh_num());
  const size_t dims = hash.size();

  for (common::sfv_t::const_iterator it = query.begin();
       it != query.end(); ++it) {
    const uint32_t seed = common::hash_util::calc_string_hash(it->first);
    const vector<float> proj = get_projection(seed);

    const float weight = it->second;
    for (size_t j = 0; j < dims; ++j) {
      hash[j] += weight * proj[j];
    }
  }

  for (vector<float>::iterator h = hash.begin(); h != hash.end(); ++h) {
    *h /= bin_width_;
  }
  return hash;
}
// For every feature in `sfv`, adds (value * step_width) to the v1 stored for
// `inc_class` and, when `dec_class` is non-empty, subtracts the same amount
// from the v1 stored for `dec_class`.
//
// Class ids come from class2id_.get_id(); the id of `dec_class` is only
// requested when `dec_class` is non-empty. Missing rows and cells in tbl_
// are created by operator[].
void local_storage::bulk_update(
    const common::sfv_t& sfv,
    float step_width,
    const string& inc_class,
    const string& dec_class) {
  const uint64_t inc_id = class2id_.get_id(inc_class);

  const bool has_dec = (dec_class != "");
  uint64_t dec_id = 0;
  if (has_dec) {
    dec_id = class2id_.get_id(dec_class);
  }

  for (common::sfv_t::const_iterator it = sfv.begin();
       it != sfv.end(); ++it) {
    const float delta = it->second * step_width;
    id_feature_val3_t& feature_row = tbl_[it->first];

    feature_row[inc_id].v1 += delta;
    if (has_dec) {
      feature_row[dec_id].v1 -= delta;
    }
  }
}