예제 #1
0
float recommender_base::calc_similality(common::sfv_t& q1, common::sfv_t& q2) {
  float q1_norm = calc_l2norm(q1);
  float q2_norm = calc_l2norm(q2);
  if (q1_norm == 0.f || q2_norm == 0.f) {
    return 0.f;
  }
  sort(q1.begin(), q1.end());
  sort(q2.begin(), q2.end());

  size_t i1 = 0;
  size_t i2 = 0;
  float ret = 0.f;
  while (i1 < q1.size() && i2 < q2.size()) {
    const string& ind1 = q1[i1].first;
    const string& ind2 = q2[i2].first;
    if (ind1 < ind2) {
      ++i1;
    } else if (ind1 > ind2) {
      ++i2;
    } else {
      ret += q1[i1].second * q2[i2].second;
      ++i1;
      ++i2;
    }
  }

  return ret / q1_norm / q2_norm;
}
예제 #2
0
float classifier_base::squared_norm(const common::sfv_t& fv) {
  float ret = 0.f;
  for (size_t i = 0; i < fv.size(); ++i) {
    ret += fv[i].second * fv[i].second;
  }
  return ret;
}
예제 #3
0
float classifier_base::calc_margin_and_variance(
    const common::sfv_t& sfv,
    const string& label,
    string& incorrect_label,
    float& var) const {
  float margin = calc_margin(sfv, label, incorrect_label);
  var = 0.f;

  for (size_t i = 0; i < sfv.size(); ++i) {
    const string& feature = sfv[i].first;
    const float val = sfv[i].second;
    feature_val2_t weight_covars;
    storage_->get2(feature, weight_covars);
    float label_covar = 1.f;
    float incorrect_label_covar = 1.f;
    for (size_t j = 0; j < weight_covars.size(); ++j) {
      if (weight_covars[j].first == label) {
        label_covar = weight_covars[j].second.v2;
      } else if (weight_covars[j].first == incorrect_label) {
        incorrect_label_covar = weight_covars[j].second.v2;
      }
    }
    var += (label_covar + incorrect_label_covar) * val * val;
  }
  return margin;
}
예제 #4
0
static float squared_norm(const common::sfv_t& fv) {
  float norm = 0.f;
  for (size_t i = 0; i < fv.size(); ++i) {
    norm += fv[i].second * fv[i].second;
  }
  return norm;
}
예제 #5
0
void recommender_base::complete_row(const common::sfv_t& query,
                                    common::sfv_t& ret) const {
  ret.clear();
  vector<pair<string, float> > ids;
  similar_row(query, ids, complete_row_similar_num_);
  if (ids.size() == 0) {
    return;
  }

  size_t exist_row_num = 0;
  for (size_t i = 0; i < ids.size(); ++i) {
    common::sfv_t row;
    orig_.get_row(ids[i].first, row);
    if (row.size() == 0) {
      continue;
    } else {
      ++exist_row_num;
    }
    float ratio = ids[i].second;
    for (size_t j = 0; j < row.size(); ++j) {
      ret.push_back(make_pair(row[j].first, row[j].second * ratio));
    }
  }

  if (exist_row_num == 0) {
    return;
  }
  common::sort_and_merge(ret);
  for (size_t i = 0; i < ret.size(); ++i) {
    ret[i].second /= exist_row_num;
  }
}
예제 #6
0
float recommender_base::calc_l2norm(const common::sfv_t& query) {
  float ret = 0.f;
  for (size_t i = 0; i < query.size(); ++i) {
    ret += query[i].second * query[i].second;
  }
  return sqrt(ret);
}
float inverted_index_storage::calc_l2norm(const common::sfv_t& sfv) {
  float ret = 0.f;
  for (size_t i = 0; i < sfv.size(); ++i) {
    ret += sfv[i].second * sfv[i].second;
  }
  return std::sqrt(ret);
}
void inverted_index_storage::calc_scores(
    const common::sfv_t& query,
    vector<pair<string, float> >& scores,
    size_t ret_num) const {
  float query_norm = calc_l2norm(query);
  if (query_norm == 0.f) {
    return;
  }
  jubatus::util::data::unordered_map<uint64_t, float> i_scores;
  for (size_t i = 0; i < query.size(); ++i) {
    const string& fid = query[i].first;
    float val = query[i].second;
    add_inp_scores(fid, val, i_scores);
  }

  vector<pair<float, uint64_t> > sorted_scores;
  for (jubatus::util::data::unordered_map<uint64_t, float>::
      const_iterator it = i_scores.begin(); it != i_scores.end(); ++it) {
    float norm = calc_columnl2norm(it->first);
    float normed_score = (norm != 0.f) ? it->second / norm / query_norm : 0.f;
    sorted_scores.push_back(make_pair(normed_score, it->first));
  }
  sort(sorted_scores.rbegin(), sorted_scores.rend());
  for (size_t i = 0; i < sorted_scores.size() && i < ret_num; ++i) {
    scores.push_back(
        make_pair(column2id_.get_key(sorted_scores[i].second),
                  sorted_scores[i].first));
  }
}
예제 #9
0
void revert_feature(const common::sfv_t& fv, fv_converter::datum& data) {
  for (size_t i = 0; i < fv.size(); ++i) {
    std::pair<std::string, double> num_value;
    std::pair<std::string, std::string> string_value;
    if (revert_num_value(fv[i], num_value)) {
      data.num_values_.push_back(num_value);
    } else if (revert_string_value(fv[i], string_value)) {
      data.string_values_.push_back(string_value);
    }
  }
}
예제 #10
0
vector<float> random_projection(const common::sfv_t& sfv, uint32_t hash_num) {
  vector<float> proj(hash_num);
  for (size_t i = 0; i < sfv.size(); ++i) {
    const uint32_t seed = common::hash_util::calc_string_hash(sfv[i].first);
    jubatus::util::math::random::mtrand rnd(seed);
    for (uint32_t j = 0; j < hash_num; ++j) {
      proj[j] += sfv[i].second * rnd.next_gaussian();
    }
  }
  return proj;
}
예제 #11
0
vector<float> euclid_lsh::calculate_lsh(const common::sfv_t& query) const {
  vector<float> hash(mixable_storage_->get_model()->all_lsh_num());
  for (size_t i = 0; i < query.size(); ++i) {
    const uint32_t seed = common::hash_util::calc_string_hash(query[i].first);
    const vector<float> proj = get_projection(seed);
    for (size_t j = 0; j < hash.size(); ++j) {
      hash[j] += query[i].second * proj[j];
    }
  }
  for (size_t j = 0; j < hash.size(); ++j) {
    hash[j] /= bin_width_;
  }
  return hash;
}
void inverted_index_classifier::train(
    const common::sfv_t& fv,
    const std::string& label) {
  std::string id;
  {
    util::concurrent::scoped_lock lk(rand_mutex_);
    id = make_id_from_label(label, rand_);
  }
  {
    util::concurrent::scoped_wlock lk(storage_mutex_);
    storage::inverted_index_storage& inv = *mixable_storage_->get_model();
    for (size_t i = 0; i < fv.size(); ++i) {
      inv.set(fv[i].first, id, fv[i].second);
    }
  }
  set_label(label);
  labels_.get_model()->increment(label);
}
예제 #13
0
파일: lsh.cpp 프로젝트: hido/jubatus_core
void lsh::generate_column_bases(const common::sfv_t& sfv) {
  for (size_t i = 0; i < sfv.size(); ++i) {
    generate_column_base(sfv[i].first);
  }
}
예제 #14
0
void feature_hasher::hash_feature_keys(common::sfv_t& fv) const {
  for (size_t i = 0, size = fv.size(); i < size; ++i) {
    uint64_t id = common::hash_util::calc_string_hash(fv[i].first) % max_size_;
    fv[i].first = jubatus::util::lang::lexical_cast<std::string>(id);
  }
}