void recommender_base::complete_row(const sfv_t& query, sfv_t& ret) const { ret.clear(); vector<pair<string, float> > ids; similar_row(query, ids, complete_row_similar_num_); if (ids.size() == 0) { return; } size_t exist_row_num = 0; for (size_t i = 0; i < ids.size(); ++i) { sfv_t row; orig_.get_row(ids[i].first, row); if (row.size() == 0) { continue; } else { ++exist_row_num; } float ratio = ids[i].second; for (size_t j = 0; j < row.size(); ++j) { ret.push_back(make_pair(row[j].first, row[j].second * ratio)); } } if (exist_row_num == 0) { return; } sort_and_merge(ret); for (size_t i = 0; i < ret.size(); ++i) { ret[i].second /= exist_row_num; } }
float inverted_index_storage::calc_l2norm(const sfv_t& sfv){ float ret = 0.f; for (size_t i = 0; i < sfv.size(); ++i){ ret += sfv[i].second * sfv[i].second; } return sqrt(ret); }
// Returns the SQUARED L2 norm of |fv| — note that, unlike calc_l2norm
// elsewhere in the project, no square root is taken.
static float calc_norm(const sfv_t& fv) {
  float sq_sum = 0;
  for (sfv_t::const_iterator it = fv.begin(); it != fv.end(); ++it) {
    sq_sum += it->second * it->second;
  }
  return sq_sum;
}
// Returns the Euclidean (L2) norm of |query|.
float recommender_base::calc_l2norm(const sfv_t& query) {
  float squared = 0.f;
  for (size_t i = 0; i < query.size(); ++i) {
    const float v = query[i].second;
    squared += v * v;
  }
  return sqrt(squared);
}
void inverted_index_storage::calc_scores(const sfv_t& query, vector<pair<string, float> >& scores, size_t ret_num) const { float query_norm = calc_l2norm(query); if (query_norm == 0.f){ return; } pfi::data::unordered_map<uint64_t, float> i_scores; for (size_t i = 0; i < query.size(); ++i){ const string& fid = query[i].first; float val = query[i].second; add_inp_scores(fid, val, i_scores); } vector<pair<float, uint64_t> > sorted_scores; for (pfi::data::unordered_map<uint64_t, float>::const_iterator it = i_scores.begin(); it != i_scores.end(); ++it){ float norm = calc_columnl2norm(it->first); float normed_score = (norm != 0.f) ? it->second / norm / query_norm : 0.f; sorted_scores.push_back(make_pair(normed_score, it->first)); } sort(sorted_scores.rbegin(), sorted_scores.rend()); for (size_t i = 0; i < sorted_scores.size() && i < ret_num; ++i){ scores.push_back(make_pair(column2id_.get_key(sorted_scores[i].second), sorted_scores[i].first)); } }
void AROW::update(const sfv_t& sfv, float alpha, float beta, const std::string& pos_label, const std::string& neg_label){ for (sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it){ const string& feature = it->first; float val = it->second; storage::feature_val2_t ret; storage_->get2(feature, ret); storage::val2_t pos_val(0.f, 1.f); storage::val2_t neg_val(0.f, 1.f); ClassifierUtil::get_two(ret, pos_label, neg_label, pos_val, neg_val); storage_->set2(feature, pos_label, storage::val2_t(pos_val.v1 + alpha * pos_val.v2 * val, pos_val.v2 - beta * pos_val.v2 * pos_val.v2 * val * val)); if (neg_label != "") storage_->set2(feature, neg_label, storage::val2_t(neg_val.v1 - alpha * neg_val.v2 * val, neg_val.v2 - beta * neg_val.v2 * neg_val.v2 * val * val)); } }
void local_storage_mixture::bulk_update(const sfv_t& sfv, float step_width, const string& inc_class, const string& dec_class){ uint64_t inc_id = class2id_.get_id(inc_class); if (dec_class != ""){ uint64_t dec_id = class2id_.get_id(dec_class); for (sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it){ float val = it->second * step_width; id_feature_val3_t& feature_row = tbl_diff_[it->first]; feature_row[inc_id].v1 += val; feature_row[dec_id].v1 -= val; } } else { for (sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it){ float val = it->second * step_width; id_feature_val3_t& feature_row = tbl_diff_[it->first]; feature_row[inc_id].v1 += val; } } }
// Sorts |sfv| by feature key and collapses each run of identical keys into a
// single entry whose value is the sum of the run. The merged result is built
// in a scratch vector and swapped into |sfv|.
void sort_and_merge(sfv_t& sfv) {
  if (sfv.empty()) {
    return;
  }
  sort(sfv.begin(), sfv.end());

  sfv_t merged;
  size_t run_start = 0;
  while (run_start < sfv.size()) {
    float sum = sfv[run_start].second;
    size_t next = run_start + 1;
    while (next < sfv.size() && sfv[next].first == sfv[run_start].first) {
      sum += sfv[next].second;
      ++next;
    }
    merged.push_back(make_pair(sfv[run_start].first, sum));
    run_start = next;
  }
  sfv.swap(merged);
}
void local_storage_mixture::inp(const sfv_t& sfv, map_feature_val1_t& ret) { ret.clear(); std::vector<float> ret_id(class2id_.size()); for (sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it){ const string& feature = it->first; const float val = it->second; id_feature_val3_t m; get_internal(feature, m); for (id_feature_val3_t::const_iterator it3 = m.begin(); it3 != m.end(); ++it3){ ret_id[it3->first] += it3->second.v1 * val; } } for (size_t i = 0; i < ret_id.size(); ++i){ if (ret_id[i] == 0.f) continue; ret[class2id_.get_key(i)] = ret_id[i]; } }
// Reconstructs a datum from a feature vector: each feature is decoded as a
// numeric value if possible, otherwise as a string value; features that
// match neither converter are silently dropped.
void revert_feature(const sfv_t& fv, fv_converter::datum& data) {
  for (size_t i = 0; i < fv.size(); ++i) {
    pair<string, float> num_value;
    if (revert_num_value(fv[i], num_value)) {
      data.num_values_.push_back(num_value);
      continue;
    }
    pair<string, string> string_value;
    if (revert_string_value(fv[i], string_value)) {
      data.string_values_.push_back(string_value);
    }
  }
}
// Returns the cosine similarity of |q1| and |q2|.
// NOTE: both arguments are sorted in place by feature key as a side effect
// (which is why they are taken by non-const reference).
float recommender_base::calc_similality(sfv_t& q1, sfv_t& q2) {
  const float q1_norm = calc_l2norm(q1);
  const float q2_norm = calc_l2norm(q2);
  if (q1_norm == 0.f || q2_norm == 0.f) {
    return 0.f;
  }

  // Sort both vectors so the dot product reduces to one linear merge pass.
  sort(q1.begin(), q1.end());
  sort(q2.begin(), q2.end());

  float dot = 0.f;
  size_t i1 = 0;
  size_t i2 = 0;
  while (i1 < q1.size() && i2 < q2.size()) {
    const string& k1 = q1[i1].first;
    const string& k2 = q2[i2].first;
    if (k1 == k2) {
      dot += q1[i1].second * q2[i2].second;
      ++i1;
      ++i2;
    } else if (k1 < k2) {
      ++i1;
    } else {
      ++i2;
    }
  }
  return dot / q1_norm / q2_norm;
}
// Scales every feature value by its global weight, then erases the features
// whose weighted value became zero (erase-remove with the is_zero predicate).
void weight_manager::get_weight(sfv_t& fv) const {
  for (size_t i = 0; i < fv.size(); ++i) {
    fv[i].second *= get_global_weight(fv[i].first);
  }
  fv.erase(remove_if(fv.begin(), fv.end(), is_zero()), fv.end());
}
// Normal Herd (NHERD) update for one example.
// For every feature, the mean weight (v1) of the positive label moves toward
// the example and, when |neg_label| is non-empty, the negative label's mean
// moves away from it; the confidence (v2) of both labels is shrunk using the
// aggressiveness parameter C_.
// NOTE(review): the mean step divides by (sigma*x*x + 1/C) per feature and
// the confidence step uses (2C + C^2 * variance) — presumably following the
// NHERD (Gaussian herding) formulation; confirm against the reference paper.
void NHERD::update(const sfv_t& sfv, float margin, float variance, const string& pos_label, const string& neg_label){
  for (sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it){
    const string& feature = it->first;
    float val = it->second;
    storage::feature_val2_t ret;
    storage_->get2(feature, ret);
    // Unseen features default to mean 0, confidence 1.
    storage::val2_t pos_val(0.f, 1.f);
    storage::val2_t neg_val(0.f, 1.f);
    ClassifierUtil::get_two(ret, pos_label, neg_label, pos_val, neg_val);
    // Per-feature confidence-scaled input: x_i * sigma_i.
    float val_covariance_pos = val * pos_val.v2;
    float val_covariance_neg = val * neg_val.v2;
    storage_->set2(feature, pos_label,
                   storage::val2_t(pos_val.v1 + (1.f - margin) * val_covariance_pos / (val_covariance_pos * val + 1.f / C_),
                                   1.f / ((1.f / pos_val.v2) + (2 * C_ + C_ * C_ * variance) * val * val)));
    if (neg_label != "")
      storage_->set2(feature, neg_label,
                     storage::val2_t(neg_val.v1 - (1.f - margin) * val_covariance_neg / (val_covariance_neg * val + 1.f / C_),
                                     1.f / ((1.f / neg_val.v2) + (2 * C_ + C_ * C_ * variance) * val * val)));
  }
}
void CW::update(const sfv_t& sfv, float step_width, const string& pos_label, const string& neg_label){ for (sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it){ const string& feature = it->first; float val = it->second; storage::feature_val2_t val2; storage_->get2(feature, val2); storage::val2_t pos_val(0.f, 1.f); storage::val2_t neg_val(0.f, 1.f); ClassifierUtil::get_two(val2, pos_label, neg_label, pos_val, neg_val); const float C = config.C; float covar_pos_step = 2.f * step_width * pos_val.v2 * val * val * C; float covar_neg_step = 2.f * step_width * neg_val.v2 * val * val * C; storage_->set2(feature, pos_label, storage::val2_t(pos_val.v1 + step_width * pos_val.v2 * val, 1.f / (1.f / pos_val.v2 + covar_pos_step))); if (neg_label != "") storage_->set2(feature, neg_label, storage::val2_t(neg_val.v1 - step_width * neg_val.v2 * val, 1.f / (1.f / neg_val.v2 + covar_neg_step))); } }
void minhash::calc_minhash_values(const sfv_t& sfv, bit_vector& bv) const{ vector<float> min_values_buffer(hash_num_, FLT_MAX); vector<uint64_t> hash_buffer(hash_num_); for (size_t i = 0; i < sfv.size(); ++i){ uint64_t key_hash = hash_util::calc_string_hash(sfv[i].first); float val = sfv[i].second; for (uint64_t j = 0; j < hash_num_; ++j){ float hashval = calc_hash(key_hash, j, val); if (hashval < min_values_buffer[j]){ min_values_buffer[j] = hashval; hash_buffer[j] = key_hash; } } } bv.resize_and_clear(hash_num_); for (size_t i = 0; i < hash_buffer.size(); ++i){ if ((hash_buffer[i] & 1LLU) == 1){ bv.set_bit(i); } } }
// In-place variant of sort-and-merge: sorts |sfv| by feature key, then
// compacts runs of equal keys by summing their values into the front of the
// vector, erasing the leftover tail.
void sort_and_merge(sfv_t& sfv) {
  if (sfv.size() <= 1) {
    return;
  }
  sort(sfv.begin(), sfv.end());

  size_t out = 0;  // index of the last merged entry
  for (size_t in = 1; in < sfv.size(); ++in) {
    if (sfv[in].first == sfv[out].first) {
      sfv[out].second += sfv[in].second;
    } else {
      ++out;
      sfv[out] = sfv[in];
    }
  }
  sfv.erase(sfv.begin() + out + 1, sfv.end());
}
// Copies the stored row for |id| into |ret|, clearing |ret| first.
// NOTE(review): assumes orig_.get_row leaves |ret| empty for an unknown id
// (callers elsewhere in this file treat an empty row as "not found") —
// confirm against the storage implementation.
void recommender_base::decode_row(const std::string& id, sfv_t& ret) const {
  ret.clear();
  orig_.get_row(id, ret);
}
// Replaces every feature key with the decimal string form of its hash,
// bucketed into [0, max_size_). Values are left untouched.
void feature_hasher::hash_feature_keys(sfv_t& fv) const {
  for (sfv_t::iterator it = fv.begin(); it != fv.end(); ++it) {
    const uint64_t bucket = hash_util::calc_string_hash(it->first) % max_size_;
    it->first = pfi::lang::lexical_cast<string>(bucket);
  }
}
// Emits the binary feature "key$value" with weight 1.0, turning a numeric
// value into a categorical feature keyed by its printed representation.
void add_feature(const std::string& key, double value, sfv_t& ret_fv) const {
  std::ostringstream name;
  name << key << "$" << value;
  ret_fv.push_back(make_pair(name.str(), 1.0));
}
void keyword_weights::update_document_frequency(const sfv_t& fv) { ++document_count_; for (sfv_t::const_iterator it = fv.begin(); it != fv.end(); ++it) { ++document_frequencies_[it->first]; } }
void recommender_base::complete_row(const std::string& id, sfv_t& ret) const { ret.clear(); sfv_t sfv; orig_.get_row(id, sfv); complete_row(sfv, ret); }
void lsh::generate_column_bases(const sfv_t& sfv){ for (size_t i = 0; i < sfv.size(); ++i){ generate_column_base(sfv[i].first); } }
// Emits the feature with a log-damped value: values <= 1 are clamped to 1
// before the log, so the resulting weight is always >= 0.
void add_feature(const std::string& key, double value, sfv_t& ret_fv) const {
  const double damped = std::log(std::max(1.0, value));
  ret_fv.push_back(make_pair(key, damped));
}