void unity_sgraph::fast_validate_add_edges(const sframe& edges, std::string src_field, std::string dst_field, size_t groupa, size_t groupb) const { if (!edges.contains_column(src_field)) { log_and_throw("Input sframe does not contain source id column: " + src_field); } if (!edges.contains_column(dst_field)) { log_and_throw("Input sframe does not contain target id column: " + dst_field); } flex_type_enum src_id_type = edges.column_type(edges.column_index(src_field)); flex_type_enum dst_id_type = edges.column_type(edges.column_index(dst_field)); if (src_id_type != dst_id_type) { std::string msg = "Source and target ids have different types: "; msg += std::string(flex_type_enum_to_name(src_id_type)) + " != " + flex_type_enum_to_name(dst_id_type); log_and_throw(msg); } if (src_id_type != flex_type_enum::INTEGER && src_id_type != flex_type_enum::STRING) { log_and_throw( std::string("Invalid id column type : ") + flex_type_enum_to_name(src_id_type) + ". Supported types are: integer and string." ); } }
void unity_sgraph::fast_validate_add_vertices(const sframe& vertices, std::string id_field, size_t group) const { if (!vertices.contains_column(id_field)) { log_and_throw("Input sframe does not contain id column: " + id_field); } flex_type_enum id_type = vertices.column_type(vertices.column_index(id_field)); if (id_type != flex_type_enum::INTEGER && id_type != flex_type_enum::STRING) { log_and_throw( std::string("Invalid id column type : ") + flex_type_enum_to_name(id_type) + ". Supported types are: integer and string." ); } }
/**
 * Joins two SFrames on the given column pairs using a hash join.
 *
 * \param sf_left          Left-hand frame; must have at least one row and
 *                         one column.
 * \param sf_right         Right-hand frame; same requirement.
 * \param join_type        One of "outer", "left", "right" or "inner"
 *                         (compared case-insensitively).
 * \param join_columns     Maps a column name in sf_left to the column name in
 *                         sf_right it is matched against. The paired columns
 *                         must have identical types.
 * \param max_buffer_size  Forwarded unchanged to the hash join executor.
 *
 * \return The result of hash_join_executor::grace_hash_join().
 *
 * Failures (empty input, mismatched column types, unknown join type) are
 * reported through log_and_throw.
 */
sframe join(sframe& sf_left,
            sframe& sf_right,
            std::string join_type,
            const std::map<std::string,std::string> join_columns,
            size_t max_buffer_size) {
  // ***SANITY CHECKS
  // Each side needs at least one row and one column.
  if (sf_left.num_rows() == 0 || sf_left.num_columns() == 0) {
    log_and_throw("Current SFrame has nothing to join!");
  }
  if (sf_right.num_rows() == 0 || sf_right.num_columns() == 0) {
    log_and_throw("Given SFrame has nothing to join!");
  }

  std::vector<size_t> left_join_positions;
  std::vector<size_t> right_join_positions;

  for (auto pair_it = join_columns.begin(); pair_it != join_columns.end(); ++pair_it) {
    const std::string& left_name = pair_it->first;
    const std::string& right_name = pair_it->second;

    // Resolve both names to positions; the original comments note that
    // column_index throws when a name is not found.
    left_join_positions.push_back(sf_left.column_index(left_name));
    right_join_positions.push_back(sf_right.column_index(right_name));

    // Paired columns have to share a type to be comparable.
    const auto left_type = sf_left.column_type(left_join_positions.back());
    const auto right_type = sf_right.column_type(right_join_positions.back());
    if (left_type != right_type) {
      log_and_throw("Columns " + left_name + " and " + right_name +
                    " does not have the same type in both SFrames.");
    }
  }

  // Translate the textual join type (case-insensitive) into the enum.
  boost::algorithm::to_lower(join_type);
  join_type_t in_join_type;
  if (join_type == "inner") {
    in_join_type = INNER_JOIN;
  } else if (join_type == "left") {
    in_join_type = LEFT_JOIN;
  } else if (join_type == "right") {
    in_join_type = RIGHT_JOIN;
  } else if (join_type == "outer") {
    in_join_type = FULL_JOIN;
  } else {
    log_and_throw("Invalid join type given!");
  }

  // Run the join.
  join_impl::hash_join_executor join_executor(sf_left,
                                              sf_right,
                                              left_join_positions,
                                              right_join_positions,
                                              in_join_type,
                                              max_buffer_size);
  return join_executor.grace_hash_join();
}
void sframe_reader::init(const sframe& frame, size_t num_segments) { Dlog_func_entry(); typedef sarray_reader<flexible_type> array_reader_type; ASSERT_MSG(!inited, "SFrame reader already inited"); index_info = frame.get_index_info(); // no columns. Just stop. if (index_info.column_names.size() == 0) { m_num_segments = 0; return; } if (num_segments == (size_t)(-1)) { // use the segmentation of the first column m_num_segments = frame.columns[0]->get_index_info().nsegments; std::vector<size_t> segment_sizes = frame.columns[0]->get_index_info().segment_sizes; for (size_t i = 0;i < index_info.column_names.size(); ++i) { column_data.emplace_back(std::move(frame.columns[i]->get_reader(segment_sizes))); } } else { // create num_segments worth of segments m_num_segments = num_segments; for (size_t i = 0;i < index_info.column_names.size(); ++i) { column_data.emplace_back(std::move(frame.columns[i]->get_reader(m_num_segments))); } } }
void subplan_executor::generate_to_sframe_segment(const std::shared_ptr<planner_node>& plan, sframe& out, size_t output_segment_id) { auto outiter = out.get_output_iterator(output_segment_id); generate_to_callback_function( plan, output_segment_id, [&](size_t segment_idx, const std::shared_ptr<sframe_rows>& rows) { (*outiter) = *rows; return false; }); }
void sframe_reader::init(const sframe& frame, const std::vector<size_t>& segment_lengths) { Dlog_func_entry(); typedef sarray_reader<flexible_type> array_reader_type; ASSERT_MSG(!inited, "SFrame reader already inited"); // Verify that lengths match up index_info = frame.get_index_info(); size_t sum = 0; for (size_t s: segment_lengths) sum += s; ASSERT_EQ(sum, size()); m_num_segments = segment_lengths.size(); for (size_t i = 0;i < index_info.column_names.size(); ++i) { column_data.emplace_back(std::move(frame.columns[i]->get_reader(segment_lengths))); } }
std::vector<sframe> shuffle( sframe sframe_in, size_t n, std::function<size_t(const std::vector<flexible_type>&)> hash_fn) { ASSERT_GT(n, 0); // split the work to threads // for n bins let's assign n / log(n) workers, assuming rows are evenly distributed. size_t num_rows = sframe_in.num_rows(); size_t num_workers = graphlab::thread::cpu_count(); size_t rows_per_worker = num_rows / num_workers; // prepare the out sframe std::vector<sframe> sframe_out; std::vector<sframe::iterator> sframe_out_iter; sframe_out.resize(n); for (auto& sf: sframe_out) { sf.open_for_write(sframe_in.column_names(), sframe_in.column_types(), "", 1); sframe_out_iter.push_back(sf.get_output_iterator(0)); } std::vector<std::unique_ptr<std::mutex>> sframe_out_locks; for (size_t i = 0; i < n; ++i) { sframe_out_locks.push_back(std::unique_ptr<std::mutex>(new std::mutex)); } auto reader = sframe_in.get_reader(); parallel_for(0, num_workers, [&](size_t worker_id) { size_t start_row = worker_id * rows_per_worker; size_t end_row = (worker_id == (num_workers-1)) ? 
num_rows : (worker_id + 1) * rows_per_worker; // prepare thread local output buffer for each sframe std::vector<buffered_writer<std::vector<flexible_type>, sframe::iterator>> writers; for (size_t i = 0; i < n; ++i) { writers.push_back( buffered_writer<std::vector<flexible_type>, sframe::iterator> (sframe_out_iter[i], *sframe_out_locks[i], WRITER_BUFFER_SOFT_LIMIT, WRITER_BUFFER_HARD_LIMIT) ); } std::vector<std::vector<flexible_type>> in_buffer(READER_BUFFER_SIZE); while (start_row < end_row) { // read a chunk of rows to shuffle size_t rows_to_read = std::min<size_t>((end_row - start_row), READER_BUFFER_SIZE); size_t rows_read = reader->read_rows(start_row, start_row + rows_to_read, in_buffer); DASSERT_EQ(rows_read, rows_to_read); start_row += rows_read; for (auto& row : in_buffer) { size_t out_index = hash_fn(row) % n; writers[out_index].write(row); } } // end of while // flush the rest of the buffer for (size_t i = 0; i < n; ++i) { writers[i].flush(); } }); // close all sframe writers for (auto& sf: sframe_out) { sf.close(); } return sframe_out; }
/**
 * Groups `source` by the `keys` columns and evaluates one aggregate per
 * entry of `groups`.
 *
 * \param source               The frame to aggregate.
 * \param keys                 Names of the key columns. Must exist in
 *                             `source` and must not contain repeats.
 * \param output_column_names  One name per entry of `groups`. An empty string
 *                             requests an automatically generated name.
 *                             Non-empty names must be unique and must not
 *                             clash with a key name.
 * \param groups               For each aggregate: the input column names and
 *                             the aggregator applied to them.
 * \param max_buffer_size      Forwarded to the group aggregate container.
 *
 * \return A closed frame holding the key columns followed by one column per
 *         aggregate. The key columns appear in the iteration order of a
 *         std::set of the key names (sorted), not in the order of `keys`.
 *
 * Invalid arguments are reported through log_and_throw.
 */
sframe groupby_aggregate(
    const sframe& source,
    const std::vector<std::string>& keys,
    const std::vector<std::string>& output_column_names,
    const std::vector<std::pair<std::vector<std::string>,
                                std::shared_ptr<group_aggregate_value>>>& groups,
    size_t max_buffer_size) {
  // first, sanity checks
  if (output_column_names.size() != groups.size()) {
    log_and_throw("There must be as many output columns as there are groups");
  }

  // Keys must not repeat. This has to be tested before the output-name
  // uniqueness check below: that check also trips on duplicated keys and
  // would otherwise report them with a misleading message.
  const std::set<std::string> key_columns(keys.begin(), keys.end());
  if (key_columns.size() != keys.size()) {
    log_and_throw("Group by key cannot have repeated column names");
  }

  {
    // check that output column names are all unique, and do not intersect with
    // keys. Since empty values will be automatically assigned, we will skip
    // those.
    std::set<std::string> all_output_columns(key_columns);
    size_t named_column_count = 0;
    for (const auto& s: output_column_names) {
      if (!s.empty()) {
        all_output_columns.insert(s);
        ++named_column_count;
      }
    }
    if (all_output_columns.size() != key_columns.size() + named_column_count) {
      log_and_throw("Output columns names are not unique");
    }
  }

  // check that group keys exist
  for (const auto& key: keys) {
    if (!source.contains_column(key)) {
      log_and_throw("SFrame does not contain column " + key);
    }
  }

  // check that each group is valid
  for (const auto& group: groups) {
    for (size_t index = 0; index < group.first.size(); index++) {
      const auto& col_name = group.first[index];
      // check that the column name is valid
      if (!source.contains_column(col_name)) {
        log_and_throw("SFrame does not contain column " + col_name);
      }
      // For registered arg functions only the first column is type checked.
      if (graphlab::registered_arg_functions.count(group.second->name()) != 0 &&
          index > 0) {
        continue;
      }
      // check that the types are valid
      size_t column_number = source.column_index(col_name);
      if (!group.second->support_type(source.column_type(column_number))) {
        log_and_throw("Requested operation: " + group.second->name() +
                      " not supported on the type of column " + col_name);
      }
    }
  }

  // Collect every column referenced by some group.
  std::set<std::string> group_columns;
  for (const auto& group: groups) {
    for (const auto& col_name : group.first) {
      group_columns.insert(col_name);
    }
  }

  // ok. select out just the columns I care about
  // begin with the key columns
  std::vector<std::string> all_columns(key_columns.begin(), key_columns.end());
  // then all the group columns (as long as they are not also key columns)
  for (const auto& group_column: group_columns) {
    if (group_column != "" && key_columns.count(group_column) == 0) {
      all_columns.push_back(group_column);
    }
  }
  sframe frame_with_relevant_cols = source.select_columns(all_columns);

  // prepare the output frame
  sframe output;
  std::vector<std::string> column_names;
  std::vector<flex_type_enum> column_types;
  // output frame has the key column name and types
  for (const auto& key: key_columns) {
    column_names.push_back(key);
    column_types.push_back(source.column_type(source.column_index(key)));
  }

  // then for each group, make a unique name and determine the output group type
  for (size_t i = 0; i < groups.size(); ++i) {
    const auto& group = groups[i];
    std::string candidate_name = output_column_names[i];
    if (candidate_name.empty()) {
      std::string root_candidate_name;
      if (graphlab::registered_arg_functions.count(group.second->name()) == 0) {
        // "<aggregator> of <col1>_<col2>_..."
        for (const auto& col_name: group.first) {
          if (root_candidate_name.empty()) {
            root_candidate_name += " of " + col_name;
          } else {
            root_candidate_name += "_" + col_name;
          }
        }
        root_candidate_name = group.second->name() + root_candidate_name;
      } else {
        // "<col2> for <aggregator> of <col1>"
        if (group.first.size() != 2) {
          log_and_throw("arg functions takes exactly two arguments");
        }
        root_candidate_name += group.first[1] + " for " +
                               group.second->name() + " of " + group.first[0];
      }
      candidate_name = root_candidate_name;
      size_t ctr = 1;
      // keep trying to come up with a unique column name
      // NOTE(review): only names already placed in column_names are
      // considered; an explicit name given for a later group is not.
      while (std::find(column_names.begin(), column_names.end(),
                       candidate_name) != column_names.end()) {
        candidate_name = root_candidate_name + "." + std::to_string(ctr);
        ++ctr;
      }
    }
    column_names.push_back(candidate_name);

    std::vector<flex_type_enum> input_types;
    for (const auto& col_name : group.first) {
      input_types.push_back(source.column_type(source.column_index(col_name)));
    }
    // this statement is valid for argmax and argmin as well, because their
    // set_input_types(...) simply return input_types.
    auto output_type = group.second->set_input_types(input_types);
    column_types.push_back(output_type);
  }

  // done! now we can start on the groupby
  size_t nsegments = frame_with_relevant_cols.num_segments();
  // either nsegments, or n*log n buckets
  nsegments = std::max(nsegments,
                       thread::cpu_count() *
                           std::max<size_t>(1, log2(thread::cpu_count())));

  output.open_for_write(column_names, column_types, "", nsegments);

  groupby_aggregate_impl::group_aggregate_container
      container(max_buffer_size, nsegments);

  // ok the input sframe (frame_with_relevant_cols) contains all the values
  // we care about. However, the challenge here is to figure out how the keys
  // and values line up. By construction, all the key columns come first.
  // which is good. But group columns can be pretty much anywhere.
  size_t num_keys = keys.size();
  for (const auto& group: groups) {
    std::vector<size_t> column_numbers;
    for (const auto& col_name : group.first) {
      column_numbers.push_back(frame_with_relevant_cols.column_index(col_name));
    }
    container.define_group(column_numbers, group.second);
  }

  // done. now we can begin parallel processing
  // feed every input row into the container, one reader segment per task.
  auto input_reader = frame_with_relevant_cols.get_reader(thread::cpu_count());
  graphlab::timer ti;
  logstream(LOG_INFO) << "Filling group container: " << std::endl;
  parallel_for (0, input_reader->num_segments(), [&](size_t i) {
    auto iter = input_reader->begin(i);
    auto enditer = input_reader->end(i);
    while (iter != enditer) {
      auto& row = *iter;
      container.add(row, num_keys);
      ++iter;
    }
  });

  logstream(LOG_INFO) << "Group container filled in " << ti.current_time() << std::endl;
  logstream(LOG_INFO) << "Writing output: " << std::endl;
  ti.start();
  container.group_and_write(output);
  logstream(LOG_INFO) << "Output written in: " << ti.current_time() << std::endl;
  output.close();
  return output;
}