Пример #1
0
/**
 * Builds a graph from optional vertex and edge tables.
 *
 * A fresh graph is instantiated first. Then, for each input that is not
 * empty, the corresponding add_* call is made and this object adopts the
 * m_sgraph of the graph that call returns. Vertices are handled before
 * edges.
 *
 * \param vertex_sframe Vertex data; skipped when empty.
 * \param edge_sframe   Edge data; skipped when empty.
 * \param vid_field     Column of vertex_sframe forwarded to add_vertices.
 * \param src_field     Column of edge_sframe forwarded to add_edges.
 * \param dst_field     Column of edge_sframe forwarded to add_edges.
 */
gl_sgraph::gl_sgraph(const gl_sframe& vertex_sframe,
                     const gl_sframe& edge_sframe,
                     const std::string& vid_field,
                     const std::string& src_field,
                     const std::string& dst_field) {
  instantiate_new();

  // Step 1: vertices (only when some were supplied).
  const bool has_vertices = !vertex_sframe.empty();
  if (has_vertices) {
    m_sgraph = add_vertices(vertex_sframe, vid_field).m_sgraph;
  }

  // Step 2: edges (only when some were supplied).
  const bool has_edges = !edge_sframe.empty();
  if (has_edges) {
    m_sgraph = add_edges(edge_sframe, src_field, dst_field).m_sgraph;
  }
}
/**
 * Scores every row of `data` with an FFM model and reports the mean log-loss.
 *
 * For each row, one ffm_node is built per (key, value) entry of every
 * non-missing feature cell, the row is passed to ffm_predict, and the
 * prediction is written to the output array. The label used for the loss is
 * +1 when the target cell is a positive integer and -1 otherwise.
 *
 * \param model           Model handed to ffm_predict.
 * \param data            Rows to score.
 * \param target_column   Name of the column holding the integer label.
 * \param feature_columns Names of the feature columns. Each non-missing cell
 *                        is read as a flex_dict whose keys are read as
 *                        integers and whose values are converted to float.
 * \return The array produced by closing the FLOAT writer, containing one
 *         prediction per row in iteration order.
 */
gl_sarray predict_sframe(ffm_model *model, gl_sframe data, std::string target_column, std::vector<std::string> feature_columns) 
{
  ffm_double loss = 0;
  std::vector<ffm_node> x;  // reused per-row feature buffer

  const size_t target_col_idx = get_column_index(data, target_column);
  std::vector<size_t> feature_col_idxs;
  feature_col_idxs.reserve(feature_columns.size());
  for (const auto& col : feature_columns) {
    feature_col_idxs.push_back(get_column_index(data, col));
  }

  gl_sarray_writer f_out(flex_type_enum::FLOAT, 1);
  size_t n_rows = 0;  // number of rows scored; denominator of the mean loss
  auto r = data.range_iterator();
  auto it = r.begin();
  for (; it != r.end(); ++it, ++n_rows) {

    x.clear();

    const std::vector<flexible_type>& row = *it;
    const auto& yval = row[target_col_idx];
    const ffm_float y = (yval.get<flex_int>() > 0) ? 1.0f : -1.0f;

    for (const size_t col_idx : feature_col_idxs) {
      // Missing feature cells contribute no nodes.
      if (row[col_idx] != FLEX_UNDEFINED) {
        const flex_dict& dv = row[col_idx].get<flex_dict>();
        const size_t n_values = dv.size();

        for (size_t k = 0; k < n_values; ++k) {
          const std::pair<flexible_type, flexible_type>& kvp = dv[k];

          ffm_node N;
          N.f = col_idx;  // the column's index in `data` is used as the field id
          N.j = kvp.first.get<flex_int>();
          N.v = static_cast<float>(kvp.second);

          x.push_back(N);
        }
      }
    }

    const ffm_float y_bar = ffm_predict(x.data(), x.data() + x.size(), model);
    f_out.write(y_bar, 0);

    loss -= (y == 1) ? std::log(y_bar) : std::log(1 - y_bar);
  }

  // Average over the rows actually scored. The previous code divided by a
  // counter that was never incremented (always 0), so the reported loss was
  // always inf/NaN. With no rows, the loss is left at 0.
  if (n_rows > 0) {
    loss /= n_rows;
  }

  logprogress_stream << "logloss = " << std::fixed << std::setprecision(5) << loss << std::endl;

  return f_out.close();
}
Пример #3
0
/**
 * Serializes an SFrame into a dictionary-valued flexible_type.
 *
 * The schema is tagged with type SFRAME. The output dictionary has two
 * entries, in this order:
 *   - "column_names": the list of column names of `input`
 *   - "columns":      each column serialized via _any_to_serializable, in the
 *                     same order as the names
 * The per-column schema produced during serialization is a local scratch
 * value and is not stored in `schema`.
 *
 * \param data   Output; overwritten with the resulting dictionary.
 * \param schema Output; receives the "type" entry.
 * \param input  The SFrame to serialize.
 */
static void _to_serializable(flexible_type& data, schema_t& schema, const gl_sframe& input) {
  schema.insert(std::make_pair("type", JSON::types::SFRAME));

  // Collect the column names as a flexible list.
  flex_list names;
  for (const auto& column_name : input.column_names()) {
    names.push_back(column_name);
  }

  // Serialize every column, walking the collected names in order.
  std::vector<flexible_type> serialized_columns;
  for (size_t idx = 0; idx < names.size(); ++idx) {
    const auto& column = input.select_column(names[idx]);
    flexible_type column_data;
    schema_t column_schema;
    _any_to_serializable(column_data, column_schema, column);
    serialized_columns.push_back(column_data);
  }

  // Assemble the result: names first, then the serialized columns.
  flex_dict result;
  result.push_back(std::make_pair("column_names", names));
  result.push_back(std::make_pair("columns", serialized_columns));
  data = result;
}
/**
 * Packages an SFrame and its column roles into an ffm_problem.
 *
 * No rows are read here; the function only fills in the descriptor fields.
 *
 * \param data          Stored in the problem as `sf`; its size becomes `l`.
 * \param target        Stored as `target_column`.
 * \param features      Stored as `feature_columns`.
 * \param max_field_idx Stored as `m`.
 * \param max_key_idx   Stored as `n`.
 * \return The populated ffm_problem.
 */
ffm_problem read_sframe(gl_sframe data, std::string target, 
                        std::vector<std::string> features, 
                        size_t max_field_idx, 
                        size_t max_key_idx)
{
    ffm_problem problem;

    // Data source and the roles of its columns.
    problem.sf = data;
    problem.target_column = target;
    problem.feature_columns = features;

    // Dimensions: row count, key bound, field bound.
    problem.l = data.size();
    problem.n = max_key_idx;
    problem.m = max_field_idx;

    return problem;
}
Пример #5
0
/// Public methods
void grouped_sframe::group(const gl_sframe &sf, const std::vector<std::string>
    column_names, bool is_grouped) {
  if(m_inited)
    log_and_throw("Group has already been called on this object!");

  // Do our "grouping" if it hasn't already been done
  if(!is_grouped) {
    m_grouped_sf = sf.sort(column_names);
  } else {
    m_grouped_sf = sf;
  }
  m_key_col_names = column_names;

  // Get indices from column names
  std::vector<size_t> col_ids;
  std::unordered_set<size_t> dedup_set;
  for(const auto &i : column_names) {
    auto col_id = sf.column_index(i);
    col_ids.push_back(col_id);
    auto ins_ret = dedup_set.insert(col_id);
    if(!ins_ret.second)
      log_and_throw("Found duplicate column name: " + i);
  }

  // Build the directory of ranges to allow querying of the groups
  // (this is an extra, sequential pass over the data)
  auto sf_range = m_grouped_sf.range_iterator();
  auto iter = sf_range.begin();
  size_t cnt = 0;
  std::vector<flexible_type> prev_elem(col_ids.size());
  std::vector<flexible_type> cur_elem(col_ids.size());
  bool first = true;
  for(; iter != sf_range.end(); ++iter, ++cnt) {
    // Create cur_elem
    int col_cnt = 0;
    for(const auto &i : col_ids) {
      cur_elem[col_cnt] = (*iter)[i];
      ++col_cnt;
    }

    // Check for new group
    if((cur_elem != prev_elem) || first) {
      first = false;
      m_key2range.insert(std::make_pair(cur_elem, m_range_directory.size()));
      m_range_directory.push_back(cnt);
      if(cur_elem.size() == 1)
        m_group_names.push_back(cur_elem[0]);
      else
        m_group_names.push_back(cur_elem);
    }

    prev_elem = cur_elem;
  }

  if(col_ids.size() > 1) {
    m_group_type = flex_type_enum::LIST;
  } else {
    m_group_type = prev_elem[0].get_type();
  }

  m_inited = true;
}
Пример #6
0
/**
 * Adds every column of `data` to this frame.
 *
 * Columns are added one at a time through add_column, each under the name
 * it has in `data`, in the order reported by data.column_names().
 *
 * \param data Source of the columns to add.
 */
void gl_gframe::add_columns(const gl_sframe& data) {
  const auto names = data.column_names();
  for (auto name_it = names.begin(); name_it != names.end(); ++name_it) {
    add_column(data[*name_it], *name_it);
  }
}