/**
 * Builds a graph from optional vertex and edge tables.
 *
 * Starts from a freshly instantiated graph, then, for each non-empty input
 * table, replaces the underlying graph handle with the one produced by
 * add_vertices() / add_edges().
 *
 * @param vertex_sframe Vertex table; skipped when empty.
 * @param edge_sframe   Edge table; skipped when empty.
 * @param vid_field     Column of `vertex_sframe` passed to add_vertices().
 * @param src_field     Column of `edge_sframe` passed to add_edges() as source.
 * @param dst_field     Column of `edge_sframe` passed to add_edges() as destination.
 */
gl_sgraph::gl_sgraph(const gl_sframe& vertex_sframe,
                     const gl_sframe& edge_sframe,
                     const std::string& vid_field,
                     const std::string& src_field,
                     const std::string& dst_field) {
  instantiate_new();

  const bool has_vertices = !vertex_sframe.empty();
  if (has_vertices) {
    const auto& with_vertices = add_vertices(vertex_sframe, vid_field);
    m_sgraph = with_vertices.m_sgraph;
  }

  // Edges are added after vertices so they build on the graph updated above.
  const bool has_edges = !edge_sframe.empty();
  if (has_edges) {
    const auto& with_edges = add_edges(edge_sframe, src_field, dst_field);
    m_sgraph = with_edges.m_sgraph;
  }
}
/**
 * Scores every row of `data` with `model` and logs the mean log-loss.
 *
 * For each row, the feature columns are flattened into a list of ffm_node
 * entries: the node's field is the column's index within `data`, its feature
 * index is the (integer) dictionary key and its value is the dictionary value
 * converted to float. Cells equal to FLEX_UNDEFINED are skipped. The target is
 * treated as the positive class when its integer value is > 0.
 *
 * @param model           Model handed to ffm_predict().
 * @param data            Rows to score.
 * @param target_column   Name of the integer label column (used only for the
 *                        reported log-loss).
 * @param feature_columns Names of the dictionary-typed feature columns.
 * @return A FLOAT gl_sarray with one ffm_predict() output per row, in row order.
 */
gl_sarray predict_sframe(ffm_model *model, gl_sframe data, std::string target_column, std::vector<std::string> feature_columns) {
  ffm_double loss = 0;
  std::vector<ffm_node> x;  // reused across rows; clear() keeps the capacity

  const size_t target_col_idx = get_column_index(data, target_column);

  std::vector<size_t> feature_col_idxs;
  feature_col_idxs.reserve(feature_columns.size());
  for (const auto& col : feature_columns) {
    feature_col_idxs.push_back(get_column_index(data, col));
  }

  gl_sarray_writer f_out(flex_type_enum::FLOAT, 1);

  // Number of rows scored so far; doubles as the log-loss denominator.
  size_t n_rows = 0;
  auto r = data.range_iterator();
  auto it = r.begin();
  for (; it != r.end(); ++it, ++n_rows) {
    x.clear();
    const std::vector<flexible_type>& row = *it;

    const auto& yval = row[target_col_idx];
    const ffm_float y = (yval.get<flex_int>() > 0) ? 1.0f : -1.0f;

    for (const size_t col_idx : feature_col_idxs) {
      if (row[col_idx] != FLEX_UNDEFINED) {
        const flex_dict& dv = row[col_idx].get<flex_dict>();
        const size_t n_values = dv.size();
        for (size_t k = 0; k < n_values; ++k) {
          const std::pair<flexible_type, flexible_type>& kvp = dv[k];
          ffm_node N;
          N.f = col_idx;
          N.j = kvp.first.get<flex_int>();
          N.v = static_cast<float>(kvp.second);
          x.push_back(N);
        }
      }
    }

    const ffm_float y_bar = ffm_predict(x.data(), x.data()+x.size(), model);
    f_out.write(y_bar, 0);

    // NOTE(review): presumably y_bar is a probability in (0, 1); a value of
    // exactly 0 or 1 would make the log below infinite -- confirm.
    loss -= y==1? log(y_bar) : log(1-y_bar);
  }

  // Average over the rows actually scored. The previous code divided by a
  // counter that was never incremented, so the reported loss was always
  // inf/NaN. With no rows the loss is reported as 0.
  if (n_rows > 0) {
    loss /= n_rows;
  }

  // std::endl is kept deliberately: the flush terminates the progress message.
  logprogress_stream << "logloss = " << std::fixed << std::setprecision(5) << loss << std::endl;

  return f_out.close();
}
/**
 * Converts an SFrame into its serializable (data, schema) representation.
 *
 * `schema` receives a "type" entry set to JSON::types::SFRAME. `data` is
 * assigned a dictionary with two entries, in this order:
 *   - "column_names": list of the frame's column names;
 *   - "columns":      list with each column's serialized form (as produced by
 *                     _any_to_serializable), in the same order as the names.
 *
 * The per-column schema produced while serializing each column is discarded.
 *
 * @param data   Output: the serialized payload.
 * @param schema Output: schema describing the payload.
 * @param input  SFrame to serialize.
 */
static void _to_serializable(flexible_type& data, schema_t& schema, const gl_sframe& input) {
  schema.insert(std::make_pair("type", JSON::types::SFRAME));

  // Collect the column names as flexible values.
  flex_list names;
  for (const auto& column_name : input.column_names()) {
    names.push_back(column_name);
  }

  // Serialize each column in name order.
  std::vector<flexible_type> serialized_columns;
  for (const auto& column_name : names) {
    const auto& column = input.select_column(column_name);
    flexible_type column_data;
    schema_t column_schema;  // filled by the call below but not used further
    _any_to_serializable(column_data, column_schema, column);
    serialized_columns.push_back(column_data);
  }

  // Assemble the payload; entry order matches the original layout.
  flex_dict payload;
  payload.push_back(std::make_pair("column_names", names));
  payload.push_back(std::make_pair("columns", serialized_columns));
  data = payload;
}
/**
 * Packages an SFrame and its column selection into an ffm_problem.
 *
 * No rows are read here; the frame itself is stored in the returned problem
 * alongside its size and the supplied index bounds.
 *
 * @param data          Source frame; stored in `sf`, and `data.size()` in `l`.
 * @param target        Stored in `target_column`.
 * @param features      Stored in `feature_columns`.
 * @param max_field_idx Stored in `m`.
 * @param max_key_idx   Stored in `n`.
 * @return The populated ffm_problem.
 */
ffm_problem read_sframe(gl_sframe data,
                        std::string target,
                        std::vector<std::string> features,
                        size_t max_field_idx,
                        size_t max_key_idx) {
  ffm_problem problem;

  // Dimensions.
  problem.l = data.size();
  problem.m = max_field_idx;
  problem.n = max_key_idx;

  // Data source and column selection.
  problem.sf = data;
  problem.target_column = target;
  problem.feature_columns = features;

  return problem;
}
/// Public methods void grouped_sframe::group(const gl_sframe &sf, const std::vector<std::string> column_names, bool is_grouped) { if(m_inited) log_and_throw("Group has already been called on this object!"); // Do our "grouping" if it hasn't already been done if(!is_grouped) { m_grouped_sf = sf.sort(column_names); } else { m_grouped_sf = sf; } m_key_col_names = column_names; // Get indices from column names std::vector<size_t> col_ids; std::unordered_set<size_t> dedup_set; for(const auto &i : column_names) { auto col_id = sf.column_index(i); col_ids.push_back(col_id); auto ins_ret = dedup_set.insert(col_id); if(!ins_ret.second) log_and_throw("Found duplicate column name: " + i); } // Build the directory of ranges to allow querying of the groups // (this is an extra, sequential pass over the data) auto sf_range = m_grouped_sf.range_iterator(); auto iter = sf_range.begin(); size_t cnt = 0; std::vector<flexible_type> prev_elem(col_ids.size()); std::vector<flexible_type> cur_elem(col_ids.size()); bool first = true; for(; iter != sf_range.end(); ++iter, ++cnt) { // Create cur_elem int col_cnt = 0; for(const auto &i : col_ids) { cur_elem[col_cnt] = (*iter)[i]; ++col_cnt; } // Check for new group if((cur_elem != prev_elem) || first) { first = false; m_key2range.insert(std::make_pair(cur_elem, m_range_directory.size())); m_range_directory.push_back(cnt); if(cur_elem.size() == 1) m_group_names.push_back(cur_elem[0]); else m_group_names.push_back(cur_elem); } prev_elem = cur_elem; } if(col_ids.size() > 1) { m_group_type = flex_type_enum::LIST; } else { m_group_type = prev_elem[0].get_type(); } m_inited = true; }
/**
 * Adds every column of `data` to this frame.
 *
 * Columns are added one at a time through add_column(), in the order reported
 * by `data.column_names()`, each under its original name.
 *
 * @param data Frame whose columns are added.
 */
void gl_gframe::add_columns(const gl_sframe& data) {
  const auto names = data.column_names();
  for (auto name_it = names.begin(); name_it != names.end(); ++name_it) {
    add_column(data[*name_it], *name_it);
  }
}