// Returns the dot product of q1 and q2 divided by both of their L2 norms
// (as reported by calc_l2norm).
//
// Both arguments are sorted in place so that entries carrying the same key can
// be paired up in a single merge-style pass. If either norm is zero the
// function returns 0 before sorting anything.
float recommender_base::calc_similality(common::sfv_t& q1, common::sfv_t& q2) {
  const float norm1 = calc_l2norm(q1);
  const float norm2 = calc_l2norm(q2);
  if (norm1 == 0.f || norm2 == 0.f) {
    return 0.f;
  }

  sort(q1.begin(), q1.end());
  sort(q2.begin(), q2.end());

  const size_t len1 = q1.size();
  const size_t len2 = q2.size();
  size_t pos1 = 0;
  size_t pos2 = 0;
  float dot = 0.f;
  while (pos1 < len1 && pos2 < len2) {
    const string& key1 = q1[pos1].first;
    const string& key2 = q2[pos2].first;
    const int order = key1.compare(key2);
    if (order == 0) {
      // Same key on both sides: this pair contributes to the dot product.
      dot += q1[pos1].second * q2[pos2].second;
      ++pos1;
      ++pos2;
    } else if (order < 0) {
      // key1 sorts first, so it has no partner in q2; skip it.
      ++pos1;
    } else {
      ++pos2;
    }
  }

  // Divide by each norm in turn (same evaluation order as before).
  return dot / norm1 / norm2;
}
// Returns the sum of the squared values (the `second` members) of fv.
// An empty vector yields 0.
float classifier_base::squared_norm(const common::sfv_t& fv) {
  float sum = 0.f;
  for (common::sfv_t::const_iterator it = fv.begin(); it != fv.end(); ++it) {
    sum += it->second * it->second;
  }
  return sum;
}
// Returns calc_margin(sfv, label, incorrect_label) and writes a variance-like
// sum into `var`.
//
// `incorrect_label` is passed by non-const reference to calc_margin and is then
// read back here. For every feature in sfv, the entries fetched through
// storage_->get2 are scanned: the `v2` field of the entry whose key equals
// `label`, and of the entry whose key equals `incorrect_label`, are used as the
// two per-label terms (each defaults to 1 when no such entry exists). `var`
// receives the sum of (label term + incorrect-label term) * value^2.
float classifier_base::calc_margin_and_variance(
    const common::sfv_t& sfv,
    const string& label,
    string& incorrect_label,
    float& var) const {
  const float margin = calc_margin(sfv, label, incorrect_label);

  var = 0.f;
  for (common::sfv_t::const_iterator fit = sfv.begin();
       fit != sfv.end(); ++fit) {
    const string& feature = fit->first;
    const float val = fit->second;

    feature_val2_t weight_covars;
    storage_->get2(feature, weight_covars);

    float label_covar = 1.f;
    float incorrect_label_covar = 1.f;
    for (size_t k = 0; k < weight_covars.size(); ++k) {
      // The `label` test takes precedence when both labels are equal.
      if (weight_covars[k].first == label) {
        label_covar = weight_covars[k].second.v2;
        continue;
      }
      if (weight_covars[k].first == incorrect_label) {
        incorrect_label_covar = weight_covars[k].second.v2;
      }
    }

    var += (label_covar + incorrect_label_covar) * val * val;
  }

  return margin;
}
// File-local helper: returns the sum of the squared values (the `second`
// members) of fv. An empty vector yields 0.
static float squared_norm(const common::sfv_t& fv) {
  float total = 0.f;
  common::sfv_t::const_iterator cur = fv.begin();
  const common::sfv_t::const_iterator last = fv.end();
  while (cur != last) {
    total += cur->second * cur->second;
    ++cur;
  }
  return total;
}
// Fills `ret` with a weighted average of the stored rows most similar to
// `query`.
//
// `ret` is cleared first. Up to complete_row_similar_num_ candidates are
// requested from similar_row; each candidate's row is loaded from orig_ and,
// unless that row is empty, every entry is appended to `ret` scaled by the
// candidate's score. The result is then combined with common::sort_and_merge
// and each value is divided by the number of non-empty rows that were used.
// `ret` stays empty when there are no candidates or every row is empty.
void recommender_base::complete_row(const common::sfv_t& query,
                                    common::sfv_t& ret) const {
  ret.clear();

  vector<pair<string, float> > neighbors;
  similar_row(query, neighbors, complete_row_similar_num_);
  if (neighbors.empty()) {
    return;
  }

  size_t used_rows = 0;
  for (size_t n = 0; n < neighbors.size(); ++n) {
    common::sfv_t row;
    orig_.get_row(neighbors[n].first, row);
    if (row.empty()) {
      // Nothing stored for this candidate; it does not count as a used row.
      continue;
    }
    ++used_rows;

    const float weight = neighbors[n].second;
    for (common::sfv_t::const_iterator cell = row.begin();
         cell != row.end(); ++cell) {
      ret.push_back(make_pair(cell->first, cell->second * weight));
    }
  }

  if (used_rows == 0) {
    return;
  }

  common::sort_and_merge(ret);
  for (common::sfv_t::iterator out = ret.begin(); out != ret.end(); ++out) {
    out->second /= used_rows;
  }
}
// Returns the L2 norm of `query`: the square root of the sum of its squared
// values. An empty vector yields 0.
float recommender_base::calc_l2norm(const common::sfv_t& query) {
  float squared_sum = 0.f;
  for (common::sfv_t::const_iterator it = query.begin();
       it != query.end(); ++it) {
    squared_sum += it->second * it->second;
  }
  return sqrt(squared_sum);
}
// Returns the L2 norm of `sfv`: the square root of the sum of its squared
// values. An empty vector yields 0.
float inverted_index_storage::calc_l2norm(const common::sfv_t& sfv) {
  const size_t count = sfv.size();
  float squared_sum = 0.f;
  size_t pos = 0;
  while (pos < count) {
    squared_sum += sfv[pos].second * sfv[pos].second;
    ++pos;
  }
  return std::sqrt(squared_sum);
}
// Appends to `scores` at most `ret_num` (id, score) pairs for `query`, highest
// score first.
//
// Raw per-column scores are gathered by calling add_inp_scores once per query
// entry. Each raw score is then divided by the column's norm (from
// calc_columnl2norm) and by the query's L2 norm; a column whose norm is zero
// gets a score of 0. Column numbers are translated to ids through column2id_.
// Nothing is appended when the query's norm is zero. `scores` is not cleared
// here, so existing contents are kept.
void inverted_index_storage::calc_scores(
    const common::sfv_t& query,
    vector<pair<string, float> >& scores,
    size_t ret_num) const {
  const float query_norm = calc_l2norm(query);
  if (query_norm == 0.f) {
    return;
  }

  typedef jubatus::util::data::unordered_map<uint64_t, float> score_map_t;

  score_map_t raw_scores;
  for (common::sfv_t::const_iterator q = query.begin();
       q != query.end(); ++q) {
    const string& feature_id = q->first;
    float weight = q->second;
    add_inp_scores(feature_id, weight, raw_scores);
  }

  vector<pair<float, uint64_t> > ranked;
  for (score_map_t::const_iterator it = raw_scores.begin();
       it != raw_scores.end(); ++it) {
    const float column_norm = calc_columnl2norm(it->first);
    float normalized = 0.f;
    if (column_norm != 0.f) {
      normalized = it->second / column_norm / query_norm;
    }
    ranked.push_back(make_pair(normalized, it->first));
  }

  // Sorting through reverse iterators leaves the vector in descending order.
  sort(ranked.rbegin(), ranked.rend());

  size_t limit = ranked.size();
  if (ret_num < limit) {
    limit = ret_num;
  }
  for (size_t rank = 0; rank < limit; ++rank) {
    scores.push_back(
        make_pair(column2id_.get_key(ranked[rank].second),
                  ranked[rank].first));
  }
}
// Converts the entries of `fv` back into datum values and appends them to
// `data`.
//
// Each entry is first offered to revert_num_value; on success the result goes
// into data.num_values_. Otherwise it is offered to revert_string_value and,
// on success, the result goes into data.string_values_. Entries that neither
// helper accepts are dropped.
void revert_feature(const common::sfv_t& fv, fv_converter::datum& data) {
  for (common::sfv_t::const_iterator it = fv.begin(); it != fv.end(); ++it) {
    std::pair<std::string, double> num_value;
    if (revert_num_value(*it, num_value)) {
      data.num_values_.push_back(num_value);
      continue;
    }

    std::pair<std::string, std::string> string_value;
    if (revert_string_value(*it, string_value)) {
      data.string_values_.push_back(string_value);
    }
  }
}
// Projects `sfv` onto `hash_num` pseudo-random directions and returns the
// resulting vector of length `hash_num` (all zeros for an empty `sfv`).
//
// For every entry a generator is seeded with the hash of the entry's key, so
// the sequence of next_gaussian() draws depends only on that key. Each output
// component accumulates value * draw.
vector<float> random_projection(const common::sfv_t& sfv, uint32_t hash_num) {
  vector<float> projected(hash_num);
  for (common::sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it) {
    const uint32_t seed = common::hash_util::calc_string_hash(it->first);
    jubatus::util::math::random::mtrand generator(seed);
    for (vector<float>::iterator out = projected.begin();
         out != projected.end(); ++out) {
      *out += it->second * generator.next_gaussian();
    }
  }
  return projected;
}
// Computes the hash vector for `query`.
//
// The result has all_lsh_num() components (as reported by the stored model).
// For every query entry, the projection obtained from get_projection for the
// hash of the entry's key is scaled by the entry's value and added
// component-wise. Finally every component is divided by bin_width_.
vector<float> euclid_lsh::calculate_lsh(const common::sfv_t& query) const {
  vector<float> hash(mixable_storage_->get_model()->all_lsh_num());
  const size_t dims = hash.size();

  for (common::sfv_t::const_iterator it = query.begin();
       it != query.end(); ++it) {
    const uint32_t seed = common::hash_util::calc_string_hash(it->first);
    const vector<float> proj = get_projection(seed);
    for (size_t d = 0; d < dims; ++d) {
      hash[d] += it->second * proj[d];
    }
  }

  for (vector<float>::iterator h = hash.begin(); h != hash.end(); ++h) {
    *h /= bin_width_;
  }
  return hash;
}
// Stores the feature vector `fv` in the inverted index under an id derived
// from `label`, then registers the label and increments its counter.
//
// The id is generated by make_id_from_label while rand_mutex_ is held. The
// index is written while a write lock on storage_mutex_ is held. The two locks
// are taken one after the other, never at the same time. set_label and the
// counter increment run after both locks have been released.
void inverted_index_classifier::train(
    const common::sfv_t& fv,
    const std::string& label) {
  std::string id;
  {
    util::concurrent::scoped_lock rand_guard(rand_mutex_);
    id = make_id_from_label(label, rand_);
  }

  {
    util::concurrent::scoped_wlock storage_guard(storage_mutex_);
    storage::inverted_index_storage& index = *mixable_storage_->get_model();
    for (common::sfv_t::const_iterator it = fv.begin();
         it != fv.end(); ++it) {
      index.set(it->first, id, it->second);
    }
  }

  set_label(label);
  labels_.get_model()->increment(label);
}
// Calls generate_column_base once for the key of every entry in `sfv`, in
// order.
void lsh::generate_column_bases(const common::sfv_t& sfv) {
  for (common::sfv_t::const_iterator it = sfv.begin(); it != sfv.end(); ++it) {
    generate_column_base(it->first);
  }
}
// Rewrites every key of `fv` in place: the key is hashed, reduced modulo
// max_size_, and replaced by the decimal string form of the result. Values are
// left untouched.
void feature_hasher::hash_feature_keys(common::sfv_t& fv) const {
  const size_t count = fv.size();
  size_t pos = 0;
  while (pos < count) {
    uint64_t bucket =
        common::hash_util::calc_string_hash(fv[pos].first) % max_size_;
    fv[pos].first = jubatus::util::lang::lexical_cast<std::string>(bucket);
    ++pos;
  }
}