void unity_sgraph::fast_validate_add_edges(const sframe& edges, std::string src_field, std::string dst_field, size_t groupa, size_t groupb) const { if (!edges.contains_column(src_field)) { log_and_throw("Input sframe does not contain source id column: " + src_field); } if (!edges.contains_column(dst_field)) { log_and_throw("Input sframe does not contain target id column: " + dst_field); } flex_type_enum src_id_type = edges.column_type(edges.column_index(src_field)); flex_type_enum dst_id_type = edges.column_type(edges.column_index(dst_field)); if (src_id_type != dst_id_type) { std::string msg = "Source and target ids have different types: "; msg += std::string(flex_type_enum_to_name(src_id_type)) + " != " + flex_type_enum_to_name(dst_id_type); log_and_throw(msg); } if (src_id_type != flex_type_enum::INTEGER && src_id_type != flex_type_enum::STRING) { log_and_throw( std::string("Invalid id column type : ") + flex_type_enum_to_name(src_id_type) + ". Supported types are: integer and string." ); } }
/**
 * Checks whether the binary scalar operator `op` can be applied to operands
 * of types `left` and `right`.
 *
 * Returns normally when the combination is supported.
 *
 * \param left   Type of the left operand.
 * \param right  Type of the right operand.
 * \param op     Operator token: "+", "-", "*", "/", "%", "<", ">", "<=",
 *               ">=", "==", "!=", "&", "|" or "in".
 *
 * Throws a std::string describing the operand types when the operator is
 * recognised but not supported for them. An unrecognised operator (when
 * neither operand is a vector) is reported through log_and_throw.
 */
void check_operation_feasibility(flex_type_enum left,
                                 flex_type_enum right,
                                 std::string op) {
  bool operation_is_feasible = false;

  if (left == flex_type_enum::VECTOR || right == flex_type_enum::VECTOR) {
    // special handling for vectors
    // we can perform every numeric op against numbers
    // NOTE(review): `op` itself is not inspected in this branch, so any
    // operator string is accepted for vector/numeric operand pairs.
    const bool left_is_numeric = left == flex_type_enum::VECTOR ||
                                 left == flex_type_enum::INTEGER ||
                                 left == flex_type_enum::FLOAT;
    const bool right_is_numeric = right == flex_type_enum::VECTOR ||
                                  right == flex_type_enum::INTEGER ||
                                  right == flex_type_enum::FLOAT;
    operation_is_feasible = left_is_numeric && right_is_numeric;
  } else if (op == "+" || op == "-" || op == "*" || op == "/") {
    operation_is_feasible = flex_type_has_binary_op(left, right, op[0]);
  } else if (op == "%") {
    // This branch used to repeat "+", "-", "*" and "/" in its condition, but
    // those are consumed by the branch above; only "%" ever reached here.
    // Modulo is restricted to integer operands.
    operation_is_feasible = left == flex_type_enum::INTEGER &&
                            right == flex_type_enum::INTEGER;
  } else if (op == "<" || op == ">" || op == "<=" || op == ">=") {
    // the comparison operators are all compatible. we just need to check
    // the < operator
    operation_is_feasible = flex_type_has_binary_op(left, right, '<');
  } else if (op == "==" || op == "!=") {
    // equality comparison is always feasible
    operation_is_feasible = true;
  } else if (op == "&" || op == "|") {
    // boolean operations are always feasible
    operation_is_feasible = true;
  } else if (op == "in") {
    // substring test: both operands must be strings
    operation_is_feasible = left == flex_type_enum::STRING &&
                            right == flex_type_enum::STRING;
  } else {
    log_and_throw("Invalid scalar operation");
  }

  if (!operation_is_feasible) {
    // The exception type (std::string) is kept as-is for existing callers.
    throw std::string("Unsupported type operation. cannot perform operation ") +
          op + " between " +
          flex_type_enum_to_name(left) + " and " +
          flex_type_enum_to_name(right);
  }
}
void unity_sgraph::fast_validate_add_vertices(const sframe& vertices, std::string id_field, size_t group) const { if (!vertices.contains_column(id_field)) { log_and_throw("Input sframe does not contain id column: " + id_field); } flex_type_enum id_type = vertices.column_type(vertices.column_index(id_field)); if (id_type != flex_type_enum::INTEGER && id_type != flex_type_enum::STRING) { log_and_throw( std::string("Invalid id column type : ") + flex_type_enum_to_name(id_type) + ". Supported types are: integer and string." ); } }
std::ostream& operator<<(std::ostream& out, const gl_sarray& other) { auto t = other.head(10); auto dtype = other.dtype(); out << "dtype: " << flex_type_enum_to_name(dtype) << "\n"; out << "Rows: " << other.size() << "\n"; out << "["; bool first = true; for(auto i : t.range_iterator()) { if (!first) out << ","; if (dtype == flex_type_enum::STRING) out << "\""; if (i.get_type() == flex_type_enum::UNDEFINED) out << "None"; else out << i; if (dtype == flex_type_enum::STRING) out << "\""; first = false; } out << "]" << "\n"; return out; }
std::shared_ptr<sframe> sort( std::shared_ptr<lazy_sframe> sframe_ptr, const std::vector<std::string>& sort_column_names, const std::vector<bool>& sort_orders) { log_func_entry(); // get sort column indexes from column names and also check column types std::vector<size_t> sort_column_indexes(sort_column_names.size()); std::vector<flex_type_enum> supported_types = {flex_type_enum::STRING, flex_type_enum::INTEGER, flex_type_enum::FLOAT,flex_type_enum::DATETIME}; std::set<flex_type_enum> supported_type_set(supported_types.begin(), supported_types.end()); for(size_t i = 0; i < sort_column_names.size(); i++) { sort_column_indexes[i] = sframe_ptr->column_index(sort_column_names[i]); auto col_type = sframe_ptr->column_type(sort_column_indexes[i]); if (supported_type_set.count(col_type) == 0) { log_and_throw("Only column with type 'int', 'float', 'string', and 'datetime' can be sorted. Column '" + sort_column_names[i] + "'' is type: " + flex_type_enum_to_name(col_type)); } } // Estimate the size of the sframe so that we could decide number of // chunks. To account for strings, we estimate each cell is 64 bytes. // I'd love to estimate better. 
size_t estimated_sframe_size = sframe_num_cells(sframe_ptr) * 64.0; size_t num_partitions = std::ceil((1.0 * estimated_sframe_size) / sframe_config::SFRAME_SORT_BUFFER_SIZE); // Make partitions small enough for each thread to (theoretically) sort at once num_partitions = num_partitions * thread::cpu_count(); // If we have more partitions than this, we could run into open file // descriptor limits num_partitions = std::min<size_t>(num_partitions, SFRAME_SORT_MAX_SEGMENTS); DASSERT_TRUE(num_partitions > 0); // Shortcut -- if only one partition, do a in memory sort and we are done if (num_partitions <= thread::cpu_count()) { logstream(LOG_INFO) << "Sorting SFrame in memory" << std::endl; return sframe_sort_impl::sort_sframe_in_memory(sframe_ptr, sort_column_indexes, sort_orders); } // This is a collection of partition keys sorted in the required order. // Each key is a flex_list value that contains the spliting value for // each sort column. Together they defines the "cut line" for all rows in // the SFrame. std::vector<flexible_type> partition_keys; // Do a quantile sketch on the sort columns to figure out the "splitting" points // for the SFrame timer ti; bool all_sorted = sframe_sort_impl::get_partition_keys( sframe_ptr->select_columns(sort_column_names), sort_orders, num_partitions, // in parameters partition_keys); // out parameters logstream(LOG_INFO) << "Pivot estimation step: " << ti.current_time() << std::endl; // In rare case all values in the SFrame are the same, so no need to sort if (all_sorted) return sframe_ptr->get_sframe_ptr(); // scatter partition the sframe into multiple chunks, chunks are relatively // sorted, but each chunk is not sorted. The sorting of each chunk is delayed // until it is consumed. Each chunk is stored as one segment for a sarray. 
// The chunk stores a serailized version of key and value std::vector<size_t> partition_sizes; // In the case where all sort keys in a given partition are the same, then // there is no need to sort the partition. This information is derived from // scattering std::vector<bool> partition_sorted(num_partitions, true); ti.start(); auto partition_array = sframe_sort_impl::scatter_partition( sframe_ptr, sort_column_indexes, sort_orders, partition_keys, partition_sizes, partition_sorted); logstream(LOG_INFO) << "Scatter step: " << ti.current_time() << std::endl; // return a lazy sframe_ptr that would emit the sorted data lazily auto lazy_sort = std::make_shared<le_sort>( partition_array, partition_sorted, partition_sizes, sort_column_indexes, sort_orders, sframe_ptr->column_names(), sframe_ptr->column_types()); return lazy_sort->eager_sort(); }
/**
 * Computes a running (cumulative) aggregate over this SArray.
 *
 * Row i of the result holds the aggregate of all non-missing values in rows
 * [0, i]. Missing values do not contribute: a missing row repeats the
 * previous output, and rows before the first non-missing value yield
 * FLEX_UNDEFINED.
 *
 * The work is done in three phases:
 *   1. each thread aggregates its own block of rows;
 *   2. the per-block aggregates are combined so that slot i covers blocks
 *      0..i;
 *   3. each thread re-scans its block, seeded with the combined aggregate of
 *      all preceding blocks, and writes the running value for every row.
 * Phases 1 and 2 are skipped when there is a single thread or there are not
 * more rows than threads; phase 3 then runs on one thread.
 *
 * \param aggregator  Prototype aggregate; new_instance() is called on it to
 *                    create the working aggregators.
 * \return A new SArray of the aggregator's output type with one entry per
 *         input row (empty when this array is empty).
 *
 * Reports an error through log_and_throw when the aggregator does not
 * support this array's dtype.
 */
gl_sarray gl_sarray::cumulative_aggregate(
    std::shared_ptr<group_aggregate_value> aggregator) const {

  flex_type_enum input_type = this->dtype();
  flex_type_enum output_type = aggregator->set_input_types({input_type});
  if (!aggregator->support_type(input_type)) {
    std::stringstream ss;
    ss << "Cannot perform this operation on an SArray of type "
       << flex_type_enum_to_name(input_type) << "." << std::endl;
    log_and_throw(ss.str());
  }

  // Empty case.
  size_t m_size = this->size();
  if (m_size == 0) {
    return gl_sarray({}, output_type);
  }

  // Make a copy of a newly initialized aggregate for each thread.
  size_t n_threads = thread::cpu_count();
  gl_sarray_writer writer(output_type, n_threads);
  std::vector<std::shared_ptr<group_aggregate_value>> aggregators;
  aggregators.reserve(n_threads);
  for (size_t i = 0; i < n_threads; i++) {
    aggregators.push_back(
        std::shared_ptr<group_aggregate_value>(aggregator->new_instance()));
  }

  // has_value[i] is non-zero once a non-missing element exists in block i
  // (after Phase 1) and then anywhere in blocks 0..i (after Phase 2).
  // std::vector<char> rather than std::vector<bool>: every thread writes its
  // own element, and only distinct bytes are distinct memory locations.
  std::vector<char> has_value(n_threads, 0);

  // Skip Phases 1,2 when single threaded or more threads than rows.
  if ((n_threads > 1) && (m_size > n_threads)) {

    // Phase 1: Compute prefix-sums for each block.
    in_parallel([&](size_t thread_idx, size_t n_workers) {
      size_t start_row = thread_idx * m_size / n_workers;
      size_t end_row = (thread_idx + 1) * m_size / n_workers;
      bool saw_value = false;
      for (const auto& v : this->range_iterator(start_row, end_row)) {
        DASSERT_TRUE(thread_idx < aggregators.size());
        if (v != FLEX_UNDEFINED) {
          aggregators[thread_idx]->add_element_simple(v);
          saw_value = true;
        }
      }
      if (saw_value) {
        has_value[thread_idx] = 1;
      }
    });

    // Phase 2: Combine prefix-sum(s) at the end of each block.
    // Iterating i downwards guarantees that slots j < i still hold their
    // own block's aggregate when they are folded into slot i.
    for (size_t i = n_threads - 1; i > 0; i--) {
      for (size_t j = 0; j < i; j++) {
        DASSERT_TRUE(i < aggregators.size());
        DASSERT_TRUE(j < aggregators.size());
        aggregators[i]->combine(*aggregators[j]);
      }
    }
    // Propagate the "seen a value" flags the same way.
    for (size_t i = 1; i < n_threads; i++) {
      if (has_value[i - 1]) {
        has_value[i] = 1;
      }
    }
  }

  // Phase 3: Reaggregate with a re-initialized prefix-sum from previous blocks.
  auto reagg_fn = [&](size_t thread_idx, size_t n_workers) {
    flexible_type y = FLEX_UNDEFINED;
    size_t start_row = thread_idx * m_size / n_workers;
    size_t end_row = (thread_idx + 1) * m_size / n_workers;
    std::shared_ptr<group_aggregate_value> re_aggregator(
        aggregator->new_instance());

    // Initialize with the merged value.
    if (thread_idx >= 1) {
      DASSERT_TRUE(thread_idx - 1 < aggregators.size());
      // Seed the running output only when a non-missing value actually
      // precedes this block. Otherwise emit() of an empty aggregate would be
      // written for leading missing rows, whereas the single-threaded path
      // writes FLEX_UNDEFINED there.
      if (has_value[thread_idx - 1]) {
        y = aggregators[thread_idx - 1]->emit();
      }
      re_aggregator->combine(*aggregators[thread_idx - 1]);
    }

    // Write prefix-sum
    for (const auto& v : this->range_iterator(start_row, end_row)) {
      if (v != FLEX_UNDEFINED) {
        re_aggregator->add_element_simple(v);
        y = re_aggregator->emit();
      }
      writer.write(y, thread_idx);
    }
  };

  // Run single threaded if more threads than rows.
  if (m_size > n_threads) {
    in_parallel(reagg_fn);
  } else {
    reagg_fn(0, 1);
  }
  return writer.close();
}