inline void _debug_check_consistency(std::set<const node_info*>& seen) const {
  if(seen.count(this)) return;
  seen.insert(this);

  DASSERT_TRUE(pnode->operator_type == type);
  DASSERT_EQ(pnode->inputs.size(), inputs.size());
  DASSERT_TRUE(is_source_node() || !inputs.empty());

  if(attributes.num_inputs != -1)
    DASSERT_EQ(inputs.size(), attributes.num_inputs);

  DASSERT_EQ(num_columns(), infer_planner_node_num_output_columns(pnode));

  {
    // Make sure every input node records this node among its outputs,
    // with matching multiplicity.
    std::map<const node_info*, size_t> input_counts;
    for(size_t i = 0; i < inputs.size(); ++i)
      input_counts[inputs[i].get()] += 1;

    for(size_t i = 0; i < inputs.size(); ++i) {
      DASSERT_TRUE(pnode->inputs[i] == inputs[i]->pnode);

      size_t n_present = 0;
      for(const cnode_info_ptr& out : inputs[i]->outputs) {
        if(out.get() == this)
          ++n_present;
      }

      DASSERT_EQ(n_present, input_counts.at(inputs[i].get()));
    }
  }

  {
    // Make sure every output node records this node among its inputs,
    // with matching multiplicity.
    std::map<const node_info*, size_t> output_counts;
    for(size_t i = 0; i < outputs.size(); ++i)
      output_counts[outputs[i].get()] += 1;

    for(size_t i = 0; i < outputs.size(); ++i) {
      size_t n_present = 0;
      for(const cnode_info_ptr& in : outputs[i]->inputs) {
        if(in.get() == this)
          ++n_present;
      }

      DASSERT_EQ(n_present, output_counts.at(outputs[i].get()));
    }
  }

  // Recurse over the rest of the graph.
  for(size_t i = 0; i < outputs.size(); ++i)
    outputs[i]->_debug_check_consistency(seen);

  for(size_t i = 0; i < inputs.size(); ++i)
    inputs[i]->_debug_check_consistency(seen);
}
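// A minimal usage sketch (hedged: `tip` is a hypothetical node_info pointer
// for the final node of a plan). The check walks both directions through the
// graph, so the caller seeds it with an empty `seen` set; the set keeps
// diamond-shaped plans from being revisited. Debug builds only, since the
// body is all DASSERTs.
std::set<const node_info*> seen;
tip->_debug_check_consistency(seen);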
gl_sframe grouped_sframe::group_info() const {
  if (m_group_names.size() == 0) {
    log_and_throw("No groups present. Cannot obtain group info.");
  }

  // Output column names: the key columns, plus the size of each group.
  std::vector<std::string> ret_column_names = m_key_col_names;
  ret_column_names.push_back("group_size");
  DASSERT_EQ(ret_column_names.size(), m_key_col_names.size() + 1);

  // Output column types, inferred from the first group's key.
  DASSERT_TRUE(m_group_names.size() >= 1);
  std::vector<flex_type_enum> ret_column_types;
  flexible_type first_key = m_group_names[0];
  flex_type_enum key_type = first_key.get_type();
  if (key_type == flex_type_enum::LIST) {
    for (size_t k = 0; k < first_key.size(); k++) {
      ret_column_types.push_back(first_key.array_at(k).get_type());
    }
  } else {
    ret_column_types.push_back(key_type);
  }
  ret_column_types.push_back(flex_type_enum::INTEGER);
  DASSERT_EQ(ret_column_types.size(), ret_column_names.size());

  // Prepare for writing.
  size_t num_segments = thread::cpu_count();
  gl_sframe_writer writer(ret_column_names, ret_column_types, num_segments);
  size_t range_dir_size = m_range_directory.size();

  // Write the group info. Each group's row count is the distance between
  // consecutive entries in the range directory; the last group runs to the
  // end of the grouped sframe.
  in_parallel([&](size_t thread_idx, size_t num_threads) {
    size_t start_idx = range_dir_size * thread_idx / num_threads;
    size_t end_idx = range_dir_size * (thread_idx + 1) / num_threads;
    for (size_t i = start_idx; i < end_idx; i++) {
      size_t range_start = m_range_directory[i];
      size_t range_end = 0;
      if ((i + 1) == m_range_directory.size()) {
        range_end = m_grouped_sf.size();
      } else {
        range_end = m_range_directory[i + 1];
      }
      size_t num_rows = range_end - range_start;
      std::vector<flexible_type> vals = m_group_names[i];
      vals.push_back(num_rows);
      DASSERT_EQ(vals.size(), ret_column_names.size());
      writer.write(vals, thread_idx);
    }
  });
  return writer.close();
}
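// A minimal usage sketch (hedged: assumes `gsf` is a grouped_sframe that has
// already been grouped on a single "user_id" key column; the grouping call
// itself is elided). group_info() yields one row per group: the key value(s)
// followed by the group's row count.
gl_sframe info = gsf.group_info();
// info.column_names() == {"user_id", "group_size"}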
void continuous::merge_results(std::vector<continuous_result>& thread_results) {
  for (auto& thread_result : thread_results) {
    // Widen the global range to cover this thread's range, then rescale both
    // histograms onto the combined range before summing bins.
    flexible_type combined_min = std::min(m_transformer.min, thread_result.min);
    flexible_type combined_max = std::max(m_transformer.max, thread_result.max);
    m_transformer.min = combined_min;
    m_transformer.max = combined_max;
    m_transformer.rescale(combined_min, combined_max);
    thread_result.rescale(combined_min, combined_max);
    DASSERT_EQ(m_transformer.scale_min, thread_result.scale_min);
    DASSERT_EQ(m_transformer.scale_max, thread_result.scale_max);
    for (size_t i = 0; i < continuous_result::MAX_BINS; i++) {
      m_transformer.bins[i] += thread_result.bins[i];
    }
  }
}
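// Worked illustration of the merge step (numbers made up): suppose the main
// transformer has seen values in [0, 10] and a thread's result covers [4, 20].
// The combined range is [min(0, 4), max(10, 20)] == [0, 20]. Rescaling *both*
// histograms onto [0, 20] first makes bin i span the same value interval on
// each side, which is what makes the bin-wise sum above meaningful; summing
// bins computed against different ranges would mix unrelated intervals.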
/**
 * Write column_names and column_values (as a row in the sframe) to JSONNode.
 */
void sframe_row_to_json(const std::vector<std::string>& column_names,
                        const std::vector<flexible_type>& column_values,
                        JSONNode& node) {
  DASSERT_EQ(column_names.size(), column_values.size());
  for (size_t i = 0; i < column_names.size(); ++i) {
    node.push_back(flexible_type_to_json(column_values[i], column_names[i]));
  }
}
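// A minimal usage sketch (hedged: the JSON_NODE tag and write_formatted() are
// from the libjson API this code already uses; the column data is made up for
// illustration).
std::vector<std::string> names = {"id", "name", "score"};
std::vector<flexible_type> values = {1, "alice", 0.75};
JSONNode row(JSON_NODE);
sframe_row_to_json(names, values, row);
std::cout << row.write_formatted() << std::endl;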
/**
 * Decrements the value of a log entry
 */
void thr_dec_log_entry(size_t entry, double value) {
  event_log_thread_local_type* ev = get_thread_counter_ref();
  DASSERT_LT(entry, MAX_LOG_SIZE);
  // does not work for cumulative logs
  DASSERT_NE((int)logs[entry]->logtype, (int)log_type::CUMULATIVE);
  DASSERT_EQ(logs[entry]->is_callback_entry, false);
  ev->values[entry] -= value;
}
void sort_and_write(SIterableType& out) {
  // Flush any elements still buffered in the individual buckets.
  parallel_for(0, num_buckets(), [&](size_t i) { buckets[i]->flush(); });
  sarray_sink->close();

  typedef typename SIterableType::iterator OutIterator;
  DASSERT_EQ(out.num_segments(), buckets.size());

  // Each bucket sorts its contents and writes them to its own output segment.
  parallel_for(0, buckets.size(), [&](size_t i) {
    buckets[i]->template sort_and_write<OutIterator>(out.get_output_iterator(i));
  });
  out.close();
}
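// A minimal sketch of the caller's side (hedged: `sorter` stands for whatever
// object owns these buckets and `T` for its element type; both are
// placeholders). The output must be opened with one segment per bucket, since
// bucket i writes segment i, and sort_and_write() closes the output itself.
sarray<T> out;
out.open_for_write(sorter.num_buckets());
sorter.sort_and_write(out);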
/**
 * Load the current block
 */
void parallel_sframe_iterator::load_current_block() {
  DASSERT_EQ(current_idx, block_end_idx);

  block_start_idx = current_idx;
  block_end_idx = std::min(end_idx, block_end_idx + max_block_size);

  // Nothing left to read: clear the buffers and stop rather than issuing an
  // empty read on every source.
  if(block_start_idx == block_end_idx) {
    for(size_t i = 0; i < buffers.size(); ++i)
      buffers[i].clear();
    return;
  }

  for(size_t i = 0; i < sources.size(); ++i) {
    sources[i]->read_rows(block_start_idx, block_end_idx, buffers[i]);
  }
}
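// A minimal sketch of the intended access pattern (hedged: the loop interface
// shown -- done(), operator++ and value(column) -- is an assumption about this
// iterator's public API; `data` and `num_threads` are illustrative).
// load_current_block() itself is internal and fires whenever the iterator
// steps past the end of its buffered block.
parallel_for(0, num_threads, [&](size_t thread_idx) {
  for(parallel_sframe_iterator it(data, thread_idx, num_threads); !it.done(); ++it) {
    flexible_type v = it.value(0);  // first column of the current row
    // ... process v ...
  }
});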
std::vector<sframe> shuffle(
    sframe sframe_in,
    size_t n,
    std::function<size_t(const std::vector<flexible_type>&)> hash_fn) {
  ASSERT_GT(n, 0);

  // Split the rows evenly across one worker per core; each worker hashes its
  // rows into the n output sframes.
  size_t num_rows = sframe_in.num_rows();
  size_t num_workers = graphlab::thread::cpu_count();
  size_t rows_per_worker = num_rows / num_workers;

  // Prepare the output sframes, with one writer lock apiece.
  std::vector<sframe> sframe_out;
  std::vector<sframe::iterator> sframe_out_iter;
  sframe_out.resize(n);
  for (auto& sf : sframe_out) {
    sf.open_for_write(sframe_in.column_names(), sframe_in.column_types(), "", 1);
    sframe_out_iter.push_back(sf.get_output_iterator(0));
  }
  std::vector<std::unique_ptr<std::mutex>> sframe_out_locks;
  for (size_t i = 0; i < n; ++i) {
    sframe_out_locks.push_back(std::unique_ptr<std::mutex>(new std::mutex));
  }

  auto reader = sframe_in.get_reader();
  parallel_for(0, num_workers, [&](size_t worker_id) {
    size_t start_row = worker_id * rows_per_worker;
    size_t end_row = (worker_id == (num_workers - 1))
                         ? num_rows
                         : (worker_id + 1) * rows_per_worker;

    // Prepare a thread-local buffered writer for each output sframe.
    std::vector<buffered_writer<std::vector<flexible_type>, sframe::iterator>> writers;
    for (size_t i = 0; i < n; ++i) {
      writers.push_back(
          buffered_writer<std::vector<flexible_type>, sframe::iterator>(
              sframe_out_iter[i], *sframe_out_locks[i],
              WRITER_BUFFER_SOFT_LIMIT, WRITER_BUFFER_HARD_LIMIT));
    }

    std::vector<std::vector<flexible_type>> in_buffer(READER_BUFFER_SIZE);
    while (start_row < end_row) {
      // Read a chunk of rows to shuffle.
      size_t rows_to_read = std::min<size_t>((end_row - start_row), READER_BUFFER_SIZE);
      size_t rows_read = reader->read_rows(start_row, start_row + rows_to_read, in_buffer);
      DASSERT_EQ(rows_read, rows_to_read);
      start_row += rows_read;
      for (auto& row : in_buffer) {
        size_t out_index = hash_fn(row) % n;
        writers[out_index].write(row);
      }
    }

    // Flush the rest of the buffer.
    for (size_t i = 0; i < n; ++i) {
      writers[i].flush();
    }
  });

  // Close all sframe writers.
  for (auto& sf : sframe_out) {
    sf.close();
  }
  return sframe_out;
}
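// A minimal usage sketch (hedged: assumes an existing sframe `sf`; 16 is an
// arbitrary bucket count). Hashing the first column means rows with equal
// first-column values always land in the same output sframe, since the hash
// is reduced modulo n inside shuffle().
std::vector<sframe> parts = shuffle(sf, 16,
    [](const std::vector<flexible_type>& row) { return row[0].hash(); });
DASSERT_EQ(parts.size(), 16);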
/**
 * Increments the value of a log entry
 */
inline void thr_inc_log_entry(size_t entry, double value) {
  event_log_thread_local_type* ev = get_thread_counter_ref();
  DASSERT_LT(entry, MAX_LOG_SIZE);
  DASSERT_EQ(logs[entry]->is_callback_entry, false);
  ev->values[entry] += value;
}
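// A minimal usage sketch (hedged: EVENT_ACTIVE_TASKS is a hypothetical entry
// id assumed to have been registered with the event log beforehand).
// Gauge-style entries pair an increment with a matching decrement; cumulative
// logs only support increments, which is why thr_dec_log_entry asserts
// against them.
thr_inc_log_entry(EVENT_ACTIVE_TASKS, 1.0);
// ... do work ...
thr_dec_log_entry(EVENT_ACTIVE_TASKS, 1.0);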