/**
 * Joins two SFrames on one or more column pairs using a hash join.
 *
 * \param sf_left          Left operand; must have at least one row and one
 *                         column.
 * \param sf_right         Right operand; must have at least one row and one
 *                         column.
 * \param join_type        "outer", "left", "right" or "inner". The string is
 *                         lower-cased before matching, so case is ignored.
 * \param join_columns     Maps a column name of sf_left (key) to the column
 *                         name of sf_right (value) it is joined against.
 * \param max_buffer_size  Passed through unchanged to the hash join executor.
 *
 * \return Whatever join_impl::hash_join_executor::grace_hash_join() produces.
 *
 * Fails through log_and_throw when either SFrame has no rows or no columns,
 * when a pair of join columns differs in type, or when join_type is not one
 * of the accepted names. Column lookups are expected to throw on their own
 * when a named column is missing.
 */
sframe join(sframe& sf_left,
            sframe& sf_right,
            std::string join_type,
            const std::map<std::string,std::string> join_columns,
            size_t max_buffer_size) {
  // ---- Validate the operands: each needs both rows and columns.
  if (sf_left.num_rows() == 0 || sf_left.num_columns() == 0) {
    log_and_throw("Current SFrame has nothing to join!");
  }
  if (sf_right.num_rows() == 0 || sf_right.num_columns() == 0) {
    log_and_throw("Given SFrame has nothing to join!");
  }

  // ---- Resolve every (left name, right name) pair to column positions.
  std::vector<size_t> left_join_positions;
  std::vector<size_t> right_join_positions;
  for (auto pair_it = join_columns.begin();
       pair_it != join_columns.end();
       ++pair_it) {
    const std::string& left_name = pair_it->first;
    const std::string& right_name = pair_it->second;

    // Lookup order matters for which error surfaces first: left, then right.
    const size_t left_pos = sf_left.column_index(left_name);
    left_join_positions.push_back(left_pos);
    const size_t right_pos = sf_right.column_index(right_name);
    right_join_positions.push_back(right_pos);

    // Values can only be compared meaningfully when the types agree.
    if (sf_left.column_type(left_pos) != sf_right.column_type(right_pos)) {
      log_and_throw("Columns " + left_name + " and " + right_name +
                    " does not have the same type in both SFrames.");
    }
  }

  // ---- Translate the (case-insensitive) join name into its enum value.
  boost::algorithm::to_lower(join_type);

  struct join_name_entry {
    const char* name;
    join_type_t type;
  };
  const join_name_entry known_join_types[] = {
    {"outer", FULL_JOIN},
    {"left",  LEFT_JOIN},
    {"right", RIGHT_JOIN},
    {"inner", INNER_JOIN},
  };

  join_type_t in_join_type = INNER_JOIN;  // overwritten below or we throw
  bool recognized = false;
  for (const auto& entry : known_join_types) {
    if (join_type == entry.name) {
      in_join_type = entry.type;
      recognized = true;
      break;
    }
  }
  if (!recognized) {
    log_and_throw("Invalid join type given!");
  }

  // ---- Run the join. Only the hash-join strategy is available here.
  join_impl::hash_join_executor join_executor(sf_left,
                                              sf_right,
                                              left_join_positions,
                                              right_join_positions,
                                              in_join_type,
                                              max_buffer_size);
  return join_executor.grace_hash_join();
}
std::vector<sframe> shuffle( sframe sframe_in, size_t n, std::function<size_t(const std::vector<flexible_type>&)> hash_fn) { ASSERT_GT(n, 0); // split the work to threads // for n bins let's assign n / log(n) workers, assuming rows are evenly distributed. size_t num_rows = sframe_in.num_rows(); size_t num_workers = graphlab::thread::cpu_count(); size_t rows_per_worker = num_rows / num_workers; // prepare the out sframe std::vector<sframe> sframe_out; std::vector<sframe::iterator> sframe_out_iter; sframe_out.resize(n); for (auto& sf: sframe_out) { sf.open_for_write(sframe_in.column_names(), sframe_in.column_types(), "", 1); sframe_out_iter.push_back(sf.get_output_iterator(0)); } std::vector<std::unique_ptr<std::mutex>> sframe_out_locks; for (size_t i = 0; i < n; ++i) { sframe_out_locks.push_back(std::unique_ptr<std::mutex>(new std::mutex)); } auto reader = sframe_in.get_reader(); parallel_for(0, num_workers, [&](size_t worker_id) { size_t start_row = worker_id * rows_per_worker; size_t end_row = (worker_id == (num_workers-1)) ? 
num_rows : (worker_id + 1) * rows_per_worker; // prepare thread local output buffer for each sframe std::vector<buffered_writer<std::vector<flexible_type>, sframe::iterator>> writers; for (size_t i = 0; i < n; ++i) { writers.push_back( buffered_writer<std::vector<flexible_type>, sframe::iterator> (sframe_out_iter[i], *sframe_out_locks[i], WRITER_BUFFER_SOFT_LIMIT, WRITER_BUFFER_HARD_LIMIT) ); } std::vector<std::vector<flexible_type>> in_buffer(READER_BUFFER_SIZE); while (start_row < end_row) { // read a chunk of rows to shuffle size_t rows_to_read = std::min<size_t>((end_row - start_row), READER_BUFFER_SIZE); size_t rows_read = reader->read_rows(start_row, start_row + rows_to_read, in_buffer); DASSERT_EQ(rows_read, rows_to_read); start_row += rows_read; for (auto& row : in_buffer) { size_t out_index = hash_fn(row) % n; writers[out_index].write(row); } } // end of while // flush the rest of the buffer for (size_t i = 0; i < n; ++i) { writers[i].flush(); } }); // close all sframe writers for (auto& sf: sframe_out) { sf.close(); } return sframe_out; }