Esempio n. 1
0
// Joins two SFrames on one or more pairs of columns.
//
// sf_left / sf_right : the frames to join; each must have at least one row
//                      and at least one column.
// join_type          : "outer", "left", "right" or "inner". The string is
//                      lower-cased before matching, so case is ignored.
// join_columns       : maps a column name in sf_left to the column name in
//                      sf_right it is compared against.
// max_buffer_size    : forwarded untouched to the hash join executor.
//
// Returns whatever hash_join_executor::grace_hash_join() produces.
//
// Reports an error through log_and_throw when either frame is empty, when a
// paired column differs in type between the two frames, or when join_type is
// not one of the four accepted names.
sframe join(sframe& sf_left, 
            sframe& sf_right,
            std::string join_type,
            const std::map<std::string,std::string> join_columns,
            size_t max_buffer_size) {
  // --- Input validation ---

  // Both inputs need rows and columns, otherwise there is nothing to join.
  const bool left_is_empty =
      (sf_left.num_rows() == 0) || (sf_left.num_columns() == 0);
  if(left_is_empty) {
    log_and_throw("Current SFrame has nothing to join!");
  }

  const bool right_is_empty =
      (sf_right.num_rows() == 0) || (sf_right.num_columns() == 0);
  if(right_is_empty) {
    log_and_throw("Given SFrame has nothing to join!");
  }

  // Translate each (left name, right name) pair into column indices.
  std::vector<size_t> left_key_indices;
  std::vector<size_t> right_key_indices;
  for(const auto &name_pair : join_columns) {
    const std::string &left_name = name_pair.first;
    const std::string &right_name = name_pair.second;

    // The lookups double as existence checks: column_index is expected to
    // throw when the name is unknown.
    const size_t left_idx = sf_left.column_index(left_name);
    left_key_indices.push_back(left_idx);
    const size_t right_idx = sf_right.column_index(right_name);
    right_key_indices.push_back(right_idx);

    // Keys can only be compared when both sides share a type.
    if(sf_left.column_type(left_idx) != sf_right.column_type(right_idx)) {
      log_and_throw("Columns " + left_name + " and " + right_name + 
          " does not have the same type in both SFrames.");
    }
  }

  // --- Resolve the requested join flavour (case-insensitive) ---
  boost::algorithm::to_lower(join_type);

  join_type_t resolved_join_type;
  if(join_type == "inner") {
    resolved_join_type = INNER_JOIN;
  } else if(join_type == "left") {
    resolved_join_type = LEFT_JOIN;
  } else if(join_type == "right") {
    resolved_join_type = RIGHT_JOIN;
  } else if(join_type == "outer") {
    resolved_join_type = FULL_JOIN;
  } else {
    log_and_throw("Invalid join type given!");
  }

  // --- Run the join; the hash join executor is the only strategy used ---
  join_impl::hash_join_executor executor(sf_left,
                                         sf_right,
                                         left_key_indices,
                                         right_key_indices,
                                         resolved_join_type,
                                         max_buffer_size);

  return executor.grace_hash_join();
}
Esempio n. 2
0
// Distributes the rows of sframe_in across n newly written SFrames.
//
// sframe_in : the frame whose rows are redistributed.
// n         : number of output frames; must be greater than zero.
// hash_fn   : maps a row to an integer; the row is written to output
//             number hash_fn(row) % n.
//
// Returns a vector of n closed SFrames that share sframe_in's column names
// and types.
//
// The input rows are split into one contiguous range per worker and the
// ranges are processed through parallel_for. Several workers can feed the
// same output, so each output has its own mutex, which is handed to the
// buffered writers.
// NOTE(review): row order inside each output presumably depends on thread
// scheduling -- confirm before relying on it.
std::vector<sframe> shuffle(
    sframe sframe_in,
    size_t n,
    std::function<size_t(const std::vector<flexible_type>&)> hash_fn) {

    ASSERT_GT(n, 0);

    // Split the work between threads: one worker per cpu_count(), each
    // taking a contiguous range of rows_per_worker rows. The last worker
    // also takes the remainder of the division.
    size_t num_rows = sframe_in.num_rows();
    size_t num_workers = graphlab::thread::cpu_count();
    size_t rows_per_worker = num_rows / num_workers;

    // prepare the out sframe
    std::vector<sframe> sframe_out;
    std::vector<sframe::iterator> sframe_out_iter;
    sframe_out.resize(n);
    sframe_out_iter.reserve(n);
    for (auto& sf: sframe_out) {
      // NOTE(review): the trailing 1 is presumably the segment count, which
      // would make 0 below the only valid segment -- confirm.
      sf.open_for_write(sframe_in.column_names(), sframe_in.column_types(), "",  1);
      sframe_out_iter.push_back(sf.get_output_iterator(0));
    }

    // One lock per output frame. They are heap allocated because std::mutex
    // cannot be moved when the vector grows.
    std::vector<std::unique_ptr<std::mutex>> sframe_out_locks;
    sframe_out_locks.reserve(n);
    for (size_t i = 0; i < n; ++i) {
      sframe_out_locks.push_back(std::unique_ptr<std::mutex>(new std::mutex));
    }

    // NOTE(review): a single reader is shared by every worker; this assumes
    // read_rows may be called concurrently -- confirm.
    auto reader = sframe_in.get_reader();
    parallel_for(0, num_workers, [&](size_t worker_id) {
        size_t start_row = worker_id * rows_per_worker;
        size_t end_row = (worker_id == (num_workers-1)) ? num_rows
                                                        : (worker_id + 1) * rows_per_worker;

        // prepare thread local output buffer for each sframe
        std::vector<buffered_writer<std::vector<flexible_type>, sframe::iterator>> writers;
        writers.reserve(n);
        for (size_t i = 0; i < n; ++i) {
          writers.push_back(
            buffered_writer<std::vector<flexible_type>, sframe::iterator>
            (sframe_out_iter[i], *sframe_out_locks[i],
             WRITER_BUFFER_SOFT_LIMIT, WRITER_BUFFER_HARD_LIMIT)
          );
        }

        std::vector<std::vector<flexible_type>> in_buffer(READER_BUFFER_SIZE);
        while (start_row < end_row) {
          // read a chunk of rows to shuffle
          size_t rows_to_read = std::min<size_t>((end_row - start_row), READER_BUFFER_SIZE);
          size_t rows_read = reader->read_rows(start_row, start_row + rows_to_read, in_buffer);
          DASSERT_EQ(rows_read, rows_to_read);
          // DASSERT_EQ is presumably a debug-only check. Without an
          // always-on guard a zero-row read would leave start_row unchanged
          // and this loop would never terminate.
          ASSERT_GT(rows_read, 0);
          start_row += rows_read;

          // Dispatch only the rows that were just read, so the result does
          // not depend on read_rows shrinking in_buffer after a short read.
          size_t rows_available = std::min<size_t>(rows_read, in_buffer.size());
          for (size_t i = 0; i < rows_available; ++i) {
            auto& row = in_buffer[i];
            size_t out_index = hash_fn(row) % n;
            writers[out_index].write(row);
          }
        } // end of while

        // flush the rest of the buffer
        for (size_t i = 0; i < n; ++i) {
          writers[i].flush();
        }
    });

    // close all sframe writers
    for (auto& sf: sframe_out) {
      sf.close();
    }
    return sframe_out;
}