std::shared_ptr<unity_sgraph_base> unity_sgraph::lambda_triple_apply_native(const function_closure_info& toolkit_fn_name, const std::vector<std::string>& mutated_fields) { auto native_execute_function = get_unity_global_singleton() ->get_toolkit_function_registry() ->get_native_function(toolkit_fn_name); log_func_entry(); auto lambda = [=](edge_triple& args)->void { std::vector<variant_type> var(3); var[0] = to_variant(_map_to_flex_dict(std::move(args.source))); var[1] = to_variant(_map_to_flex_dict(std::move(args.edge))); var[2] = to_variant(_map_to_flex_dict(std::move(args.target))); variant_type ret = native_execute_function(var); var = variant_get_value<std::vector<variant_type>>(ret); args.source = _map_from_flex_dict(variant_get_value<flexible_type>(var[0])); args.edge = _map_from_flex_dict(variant_get_value<flexible_type>(var[1])); args.target = _map_from_flex_dict(variant_get_value<flexible_type>(var[2])); }; return lambda_triple_apply_native(lambda, mutated_fields); }
std::shared_ptr<unity_sgraph_base> unity_sgraph::lambda_triple_apply(const std::string& lambda_str, const std::vector<std::string>& mutated_fields) { log_func_entry(); if (mutated_fields.empty()) { log_and_throw("mutated_fields cannot be empty"); } std::shared_ptr<sgraph> g = std::make_shared<sgraph>((*m_graph)()); std::vector<std::string> mutated_vertex_fields, mutated_edge_fields; const auto& all_vertex_fields = g->get_vertex_fields(); const auto& all_edge_fields = g->get_edge_fields(); std::set<std::string> all_vertex_field_set(all_vertex_fields.begin(), all_vertex_fields.end()); std::set<std::string> all_edge_field_set(all_edge_fields.begin(), all_edge_fields.end()); for (auto& f : mutated_fields) { if (f == sgraph::VID_COLUMN_NAME || f == sgraph::SRC_COLUMN_NAME || f == sgraph::DST_COLUMN_NAME) { log_and_throw("mutated fields cannot contain id field: " + f); } if (!all_vertex_field_set.count(f) && !all_edge_field_set.count(f)) { log_and_throw("mutated field \"" + f + "\" cannot be found in graph"); } if (all_vertex_field_set.count(f)) mutated_vertex_fields.push_back(f); if (all_edge_field_set.count(f)) mutated_edge_fields.push_back(f); } DASSERT_FALSE(mutated_fields.empty()); sgraph_compute::triple_apply(*g, lambda_str, mutated_vertex_fields, mutated_edge_fields); std::shared_ptr<unity_sgraph> ret(new unity_sgraph(g)); return ret; }
bool unity_sgraph::load_graph(std::string target_dir) { log_func_entry(); try { dir_archive dir; dir.open_directory_for_read(target_dir); std::string contents; if (dir.get_metadata("contents", contents) == false || contents != "graph") { log_and_throw(std::string("Archive does not contain a graph.")); } iarchive iarc(dir); load(iarc); dir.close(); } catch (std::ios_base::failure& e) { std::string message = "Unable to load graph from " + sanitize_url(target_dir) + ": " + e.what(); log_and_throw_io_failure(message); } catch (std::string& e) { std::string message = "Unable to load graph from " + sanitize_url(target_dir) + ": " + e; log_and_throw(message); } catch (...) { std::string message = "Unable to load graph from " + sanitize_url(target_dir) + ": Unknown Error."; log_and_throw(message); } return true; }
/**
 * Starts the server's poll thread.
 *
 * Does nothing if the server is already marked as started; otherwise starts
 * the poll thread of the pollset and records the started state.
 */
void comm_server::start() {
  log_func_entry();
  if (started) {
    return;
  }
  pollset->start_poll_thread();
  started = true;
}
bool unity_sgraph::save_graph(std::string target, std::string format) { log_func_entry(); try { if (format == "binary") { dir_archive dir; dir.open_directory_for_write(target); dir.set_metadata("contents", "graph"); oarchive oarc(dir); if (dir.get_output_stream()->fail()) { log_and_throw_io_failure("Fail to write"); } save(oarc); dir.close(); } else if (format == "json") { save_sgraph_to_json(get_graph(), target); } else if (format == "csv") { save_sgraph_to_csv(get_graph(), target); } else { log_and_throw("Unable to save to format : " + format); } } catch (std::ios_base::failure& e) { std::string message = "Unable to save graph to " + sanitize_url(target) + ": " + e.what(); log_and_throw_io_failure(message); } catch (std::string& e) { std::string message = "Unable to save graph to " + sanitize_url(target) + ": " + e; log_and_throw(message); } catch (...) { std::string message = "Unable to save graph to " + sanitize_url(target) + ": Unknown Error."; log_and_throw(message); } return true; }
/**
 * Returns basic size statistics of the graph.
 *
 * \return A map with two entries: "num_vertices" and "num_edges".
 */
options_map_t unity_sgraph::summary() {
  log_func_entry();

  auto& graph = (*m_graph)();

  options_map_t stats;
  stats["num_vertices"] = graph.num_vertices();
  stats["num_edges"] = graph.num_edges();
  return stats;
}
/**
 * Starts polling on the control and object sockets.
 *
 * Does nothing if the server is already marked as started; otherwise starts
 * polling on the control socket, then the object socket, and records the
 * started state.
 */
void comm_server::start() {
  log_func_entry();
  if (started) {
    return;
  }
  control_socket->start_polling();
  object_socket->start_polling();
  started = true;
}
/**
 * Inserts the contents of an SFrame into a database table through
 * m_db_connector.
 *
 * \param sf               SFrame to insert; must actually be a unity_sframe.
 * \param table_name       Name of the destination table.
 * \param append_if_exists Forwarded to the connector's insert_data().
 * \throws std::string if \p sf is null or is not a unity_sframe.
 */
void _insert_sframe(std::shared_ptr<unity_sframe_base> sf,
                    const std::string &table_name,
                    bool append_if_exists) {
  log_func_entry();
  logstream(LOG_INFO) << "append: " << append_if_exists << std::endl;

  auto sf_derived = std::dynamic_pointer_cast<unity_sframe>(sf);
  // dynamic_pointer_cast yields an empty pointer when the input is null or
  // of a different dynamic type; fail loudly instead of dereferencing it.
  if (!sf_derived) {
    throw std::string("Cannot insert into table \"" + table_name +
                      "\": input is not a valid SFrame");
  }

  auto real_sf = sf_derived->get_underlying_sframe();
  m_db_connector.insert_data(*real_sf, table_name, append_if_exists);
}
/**
 * Evaluates a lambda string on a single argument.
 *
 * Registers the lambda with the pylambda master, evaluates it on \p arg via
 * bulk_eval(), and releases the lambda again. The lambda is released even
 * when evaluation throws, so a failing call does not leave the handle
 * registered.
 *
 * \param string Lambda string passed to make_lambda().
 * \param arg    The single argument to evaluate.
 * \return The first element of the bulk evaluation result.
 */
flexible_type unity_global::eval_lambda(const std::string& string,
                                        const flexible_type& arg) {
  log_func_entry();
  lambda::pylambda_master& evaluator = lambda::pylambda_master::get_instance();
  auto lambda_hash = evaluator.make_lambda(string);

  std::vector<flexible_type> return_val;
  try {
    return_val = evaluator.bulk_eval(lambda_hash, {arg}, false, 0);
  } catch (...) {
    // Do not leak the registered lambda on failure.
    evaluator.release_lambda(lambda_hash);
    throw;
  }

  evaluator.release_lambda(lambda_hash);
  return return_val[0];
}
void comm_server::stop() { log_func_entry(); if (started) { started = false; } // Attempt to cancel any currently running command get_srv_running_command().store((unsigned long long)uint64_t(-1)); }
/**
 * Creates a new unity_sgraph and loads it from \p fname.
 *
 * Any exception raised by unity_sgraph::load_graph() propagates to the
 * caller unchanged.
 *
 * \param fname Location passed to unity_sgraph::load_graph().
 * \return The newly loaded graph.
 */
std::shared_ptr<unity_sgraph_base> unity_global::load_graph(std::string fname) {
  log_func_entry();
  std::shared_ptr<unity_sgraph> loaded(new unity_sgraph());
  loaded->load_graph(fname);
  return loaded;
}
/**
 * Evaluates a lambda string on a single keyed row.
 *
 * Registers the lambda with the pylambda master, evaluates it via
 * bulk_eval() on one row made of \p keys and \p values, and releases the
 * lambda again. The lambda is released even when evaluation throws, so a
 * failing call does not leave the handle registered.
 *
 * \param pylambda_string Lambda string passed to make_lambda().
 * \param keys            Keys passed to bulk_eval().
 * \param values          Values of the single row passed to bulk_eval().
 * \return The first element of the bulk evaluation result.
 */
flexible_type unity_global::eval_dict_lambda(
    const std::string& pylambda_string,
    const std::vector<std::string>& keys,
    const std::vector<flexible_type>& values) {
  log_func_entry();
  lambda::pylambda_master& evaluator = lambda::pylambda_master::get_instance();
  auto lambda_hash = evaluator.make_lambda(pylambda_string);

  std::vector<flexible_type> return_val;
  try {
    return_val = evaluator.bulk_eval(lambda_hash, keys, {values}, false, 0);
  } catch (...) {
    // Do not leak the registered lambda on failure.
    evaluator.release_lambda(lambda_hash);
    throw;
  }

  evaluator.release_lambda(lambda_hash);
  return return_val[0];
}
std::shared_ptr<unity_sgraph_base> unity_sgraph::rename_edge_fields(const std::vector<std::string>& oldnames, const std::vector<std::string>& newnames) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); sgraph* new_graph = new sgraph((*m_graph)()); new_graph->rename_edge_fields(oldnames, newnames); std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_value(new_graph)); return g; }
std::shared_ptr<unity_sgraph_base> unity_sgraph::select_vertex_fields( const std::vector<std::string>& fields, size_t group) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); std::vector<std::string> fields_with_id({sgraph::VID_COLUMN_NAME}); fields_with_id.insert(fields_with_id.end(), fields.begin(), fields.end()); std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_operation( new select_vertex_fields_op(fields_with_id, group), {m_graph.get()})); return g; }
/**
 * Registers a single toolkit function.
 *
 * When \p prefix is non-empty the function is registered under
 * "<prefix>.<name>".
 *
 * \param spec   Specification to register (taken by value; its name is
 *               rewritten when a prefix is given).
 * \param prefix Optional namespace prefix.
 * \return false if an entry with the final name already exists (the
 *         registry is left untouched), true otherwise.
 */
bool toolkit_function_registry::register_toolkit_function(
    toolkit_function_specification spec, std::string prefix) {
  log_func_entry();

  if (!prefix.empty()) {
    spec.name = prefix + "." + spec.name;
  }

  // Refuse to overwrite an existing registration.
  const bool already_registered = registry.find(spec.name) != registry.end();
  if (already_registered) {
    return false;
  }

  registry[spec.name] = spec;
  return true;
}
std::shared_ptr<unity_sgraph_base> unity_sgraph::swap_vertex_fields(const std::string& field1, const std::string& field2) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); if (field1 == sgraph::VID_COLUMN_NAME || field2 == sgraph::VID_COLUMN_NAME) { log_and_throw("Cannot swap id fields " + field1 + " , " + field2); } sgraph* new_graph = new sgraph((*m_graph)()); new_graph->swap_vertex_fields(field1, field2); std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_value(new_graph)); return g; }
bool toolkit_function_registry::unregister_toolkit_function(std::string name) { log_func_entry(); // look for the name auto iter = registry.find(name); if (iter != registry.end()) { // found! erase registry.erase(iter); return true; } else { // not found! fail return false; } }
bool toolkit_class_registry::register_toolkit_class( const std::string& class_name, std::function<model_base*()> constructor, std::map<std::string, flexible_type> description) { log_func_entry(); if (registry.count(class_name)) { return false; } else { registry[class_name] = constructor; description["name"] = class_name; descriptions[class_name] = description; return true; } }
std::vector<flexible_type> unity_global::parallel_eval_lambda(const std::string& string, const std::vector<flexible_type>& arg) { log_func_entry(); lambda::pylambda_master& evaluator = lambda::pylambda_master::get_instance(); auto lambda_hash = evaluator.make_lambda(string); std::vector<flexible_type> ret(arg.size()); ret.reserve(arg.size()); parallel_for (0, arg.size(), [&](size_t i) { ret[i] = evaluator.bulk_eval(lambda_hash, {arg[i]}, false, 0)[0]; }); evaluator.release_lambda(lambda_hash); return ret; }
std::shared_ptr<unity_sgraph_base> unity_sgraph::delete_vertex_field(const std::string field, size_t group) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); if (field == sgraph::VID_COLUMN_NAME) { log_and_throw("Cannot delete required field " + field); } std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_operation( new delete_vertex_field_op(field, group), {m_graph.get()})); return g; }
std::shared_ptr<unity_sgraph_base> unity_sgraph::select_edge_fields(const std::vector<std::string>& fields, size_t groupa, size_t groupb) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); std::vector<std::string> fields_with_id({sgraph::SRC_COLUMN_NAME, sgraph::DST_COLUMN_NAME}); fields_with_id.insert(fields_with_id.end(), fields.begin(), fields.end()); std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_operation( new select_edge_fields_op(fields_with_id, groupa, groupb), {m_graph.get()})); std::map<std::string, flex_type_enum> new_field_type_map; return g; }
std::shared_ptr<unity_sgraph_base> unity_sgraph::add_vertex_field( std::shared_ptr<unity_sarray_base> in_column_data, std::string field) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); if (field == sgraph::VID_COLUMN_NAME) { log_and_throw("Cannot add id field " + field); } sgraph* new_graph = new sgraph((*m_graph)()); std::shared_ptr<unity_sarray> column_data = std::static_pointer_cast<unity_sarray>(in_column_data); new_graph->add_vertex_field(column_data->get_underlying_sarray(), field); std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_value(new_graph)); return g; }
/**
 * Shuts the server down and releases everything it owns.
 *
 * Order: stop the server, close the three sockets, clear the registered
 * objects, delete the sockets, then delete every dispatcher held in
 * dispatch_map.
 */
comm_server::~comm_server() {
  log_func_entry();
  stop();

  // Close all sockets before anything is destroyed.
  object_socket->close();
  control_socket->close();
  publishsock->close();

  registered_objects.clear();

  delete object_socket;
  delete control_socket;
  delete publishsock;

  // dispatch_map holds raw dispatcher pointers as its mapped values.
  for (auto& entry : dispatch_map) {
    delete entry.second;
  }

  registered_objects.clear();
}
/**
 * Deserializes the graph from an input archive.
 *
 * Reads and verifies the magic header, reads the partition count, then
 * deserializes an sgraph with that many partitions and registers it with
 * the DAG as the new value of this graph. Holds dag_access_mutex for the
 * duration of the call.
 *
 * \param iarc Archive to read from.
 * \throws (via log_and_throw) if the magic header does not match.
 */
void unity_sgraph::load(iarchive& iarc) {
  log_func_entry();
  std::lock_guard<mutex> lock(dag_access_mutex);

  // Zero-initialized so the bytes read below stay NUL-terminated for strcmp.
  char buf[256] = "";
  size_t magic_header_size = strlen(GRAPH_MAGIC_HEADER);
  iarc.read(buf, magic_header_size);
  if (strcmp(buf, GRAPH_MAGIC_HEADER)) {
    log_and_throw(std::string("Invalid graph file."));
  }

  size_t num_partitions = 0;
  iarc >> num_partitions;

  // Owned by a unique_ptr until handed to the DAG so the graph is freed if
  // deserialization throws part-way through.
  std::unique_ptr<sgraph> g(new sgraph(num_partitions));
  iarc >> *g;

  // NOTE(review): add_value() is assumed to take ownership of the raw
  // pointer -- confirm.
  m_graph.reset(unity_sgraph::get_dag()->add_value(g.release()));
}
std::shared_ptr<unity_sgraph_base> unity_sgraph::add_vertices( std::shared_ptr<unity_sframe_base> vertices, const std::string& id_field_name, size_t group) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); std::shared_ptr<unity_sframe> unity_sf = std::static_pointer_cast<unity_sframe>(vertices); ASSERT_TRUE(unity_sf != nullptr); std::shared_ptr<sframe> sf = unity_sf->get_underlying_sframe(); fast_validate_add_vertices(*sf, id_field_name, group); std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()-> add_operation(new add_vertices_op<sframe>(sf, id_field_name, group), {m_graph.get()})); return g; }
bool toolkit_function_registry::register_toolkit_function( std::vector<toolkit_function_specification> specvec, std::string prefix) { log_func_entry(); // if there is something in the registry with this name, fail for (auto& spec: specvec) { if (prefix.length() > 0) { spec.name = prefix + "." + spec.name; } if (registry.count(spec.name)) return false; } // now register for (const auto& spec: specvec) { registry[spec.name] = spec; } return true; }
/**
 * Shuts the server down and releases everything it owns.
 *
 * Order: stop the server, close the three sockets, clear the registered
 * objects, delete the sockets and the pollset, delete every dispatcher held
 * in dispatch_map, delete keyval, and finally destroy the ZeroMQ context.
 */
comm_server::~comm_server() {
  log_func_entry();
  stop();

  // Close all sockets before anything is destroyed.
  object_socket->close();
  control_socket->close();
  publishsock->close();

  registered_objects.clear();

  delete object_socket;
  delete control_socket;
  delete publishsock;
  delete pollset;

  // dispatch_map holds raw dispatcher pointers as its mapped values.
  for (auto& entry : dispatch_map) {
    delete entry.second;
  }

  // Deleting a null pointer is a no-op, so no explicit null test is needed.
  delete keyval;

  registered_objects.clear();
  zmq_ctx_destroy(zmq_ctx);
}
std::shared_ptr<unity_sgraph_base> unity_sgraph::copy_edge_field(const std::string field, const std::string newfield, size_t groupa, size_t groupb) { log_func_entry(); std::lock_guard<mutex> lock(dag_access_mutex); if (field == newfield) { log_and_throw("Cannot copy to the same field"); } if (newfield == sgraph::SRC_COLUMN_NAME || newfield == sgraph::DST_COLUMN_NAME) { log_and_throw("Cannot copy to required field " + newfield); } std::shared_ptr<unity_sgraph> g(new unity_sgraph(*this)); g->m_graph.reset(unity_sgraph::get_dag()->add_operation( new copy_edge_field_op(field, newfield, groupa, groupb), {m_graph.get()})); return g; }
std::shared_ptr<sframe> sort( std::shared_ptr<lazy_sframe> sframe_ptr, const std::vector<std::string>& sort_column_names, const std::vector<bool>& sort_orders) { log_func_entry(); // get sort column indexes from column names and also check column types std::vector<size_t> sort_column_indexes(sort_column_names.size()); std::vector<flex_type_enum> supported_types = {flex_type_enum::STRING, flex_type_enum::INTEGER, flex_type_enum::FLOAT,flex_type_enum::DATETIME}; std::set<flex_type_enum> supported_type_set(supported_types.begin(), supported_types.end()); for(size_t i = 0; i < sort_column_names.size(); i++) { sort_column_indexes[i] = sframe_ptr->column_index(sort_column_names[i]); auto col_type = sframe_ptr->column_type(sort_column_indexes[i]); if (supported_type_set.count(col_type) == 0) { log_and_throw("Only column with type 'int', 'float', 'string', and 'datetime' can be sorted. Column '" + sort_column_names[i] + "'' is type: " + flex_type_enum_to_name(col_type)); } } // Estimate the size of the sframe so that we could decide number of // chunks. To account for strings, we estimate each cell is 64 bytes. // I'd love to estimate better. 
size_t estimated_sframe_size = sframe_num_cells(sframe_ptr) * 64.0; size_t num_partitions = std::ceil((1.0 * estimated_sframe_size) / sframe_config::SFRAME_SORT_BUFFER_SIZE); // Make partitions small enough for each thread to (theoretically) sort at once num_partitions = num_partitions * thread::cpu_count(); // If we have more partitions than this, we could run into open file // descriptor limits num_partitions = std::min<size_t>(num_partitions, SFRAME_SORT_MAX_SEGMENTS); DASSERT_TRUE(num_partitions > 0); // Shortcut -- if only one partition, do a in memory sort and we are done if (num_partitions <= thread::cpu_count()) { logstream(LOG_INFO) << "Sorting SFrame in memory" << std::endl; return sframe_sort_impl::sort_sframe_in_memory(sframe_ptr, sort_column_indexes, sort_orders); } // This is a collection of partition keys sorted in the required order. // Each key is a flex_list value that contains the spliting value for // each sort column. Together they defines the "cut line" for all rows in // the SFrame. std::vector<flexible_type> partition_keys; // Do a quantile sketch on the sort columns to figure out the "splitting" points // for the SFrame timer ti; bool all_sorted = sframe_sort_impl::get_partition_keys( sframe_ptr->select_columns(sort_column_names), sort_orders, num_partitions, // in parameters partition_keys); // out parameters logstream(LOG_INFO) << "Pivot estimation step: " << ti.current_time() << std::endl; // In rare case all values in the SFrame are the same, so no need to sort if (all_sorted) return sframe_ptr->get_sframe_ptr(); // scatter partition the sframe into multiple chunks, chunks are relatively // sorted, but each chunk is not sorted. The sorting of each chunk is delayed // until it is consumed. Each chunk is stored as one segment for a sarray. 
// The chunk stores a serailized version of key and value std::vector<size_t> partition_sizes; // In the case where all sort keys in a given partition are the same, then // there is no need to sort the partition. This information is derived from // scattering std::vector<bool> partition_sorted(num_partitions, true); ti.start(); auto partition_array = sframe_sort_impl::scatter_partition( sframe_ptr, sort_column_indexes, sort_orders, partition_keys, partition_sizes, partition_sorted); logstream(LOG_INFO) << "Scatter step: " << ti.current_time() << std::endl; // return a lazy sframe_ptr that would emit the sorted data lazily auto lazy_sort = std::make_shared<le_sort>( partition_array, partition_sorted, partition_sizes, sort_column_indexes, sort_orders, sframe_ptr->column_names(), sframe_ptr->column_types()); return lazy_sort->eager_sort(); }
/**
 * Constructs the unity_global object.
 *
 * The three pointers are stored as given; nothing else is done besides
 * logging the function entry.
 *
 * \param _toolkit_functions Toolkit function registry to use.
 * \param _classes           Toolkit class registry to use.
 * \param server             Comm server to use.
 */
unity_global::unity_global(toolkit_function_registry* _toolkit_functions,
                           toolkit_class_registry* _classes,
                           cppipc::comm_server* server)
    : toolkit_functions(_toolkit_functions),
      classes(_classes),
      server(server) {
  log_func_entry();
}