Example 1
void unity_sgraph::fast_validate_add_edges(const sframe& edges,
    std::string src_field,
    std::string dst_field,
    size_t groupa, size_t groupb) const {

  if (!edges.contains_column(src_field)) {
    log_and_throw("Input sframe does not contain source id column: " + src_field);
  }
  if (!edges.contains_column(dst_field)) {
    log_and_throw("Input sframe does not contain target id column: " + dst_field);
  }

  flex_type_enum src_id_type = edges.column_type(edges.column_index(src_field));
  flex_type_enum dst_id_type = edges.column_type(edges.column_index(dst_field));

  if (src_id_type != dst_id_type) {
    std::string msg = "Source and target ids have different types: ";
    msg += std::string(flex_type_enum_to_name(src_id_type)) + " != " + flex_type_enum_to_name(dst_id_type);
    log_and_throw(msg);
  }

  if (src_id_type != flex_type_enum::INTEGER && src_id_type != flex_type_enum::STRING) {
    log_and_throw(
        std::string("Invalid id column type: ")
        + flex_type_enum_to_name(src_id_type)
        + ". Supported types are: integer and string."
    );
  }
}
Example 2
void unity_sgraph::fast_validate_add_vertices(const sframe& vertices,
    std::string id_field, size_t group) const {
  if (!vertices.contains_column(id_field)) {
    log_and_throw("Input sframe does not contain id column: " + id_field);
  }
  flex_type_enum id_type = vertices.column_type(vertices.column_index(id_field));

  if (id_type != flex_type_enum::INTEGER && id_type != flex_type_enum::STRING) {
    log_and_throw(
        std::string("Invalid id column type: ")
        + flex_type_enum_to_name(id_type) 
        + ". Supported types are: integer and string."
    );
  }
}
Example 3
sframe join(sframe& sf_left, 
            sframe& sf_right,
            std::string join_type,
            const std::map<std::string,std::string> join_columns,
            size_t max_buffer_size) {
  // ***SANITY CHECKS 

  // check that each sframe is valid
  if(!sf_left.num_rows() || !sf_left.num_columns()) {
    log_and_throw("Current SFrame has nothing to join!");
  }

  if(!sf_right.num_rows() || !sf_right.num_columns()) {
    log_and_throw("Given SFrame has nothing to join!");
  }

  std::vector<size_t> left_join_positions;
  std::vector<size_t> right_join_positions;
  for(const auto &col_pair : join_columns) {
    // Check that all columns exist (in both sframes)
    // These will throw if not found
    left_join_positions.push_back(sf_left.column_index(col_pair.first));
    right_join_positions.push_back(sf_right.column_index(col_pair.second));
    
    // Each column must have matching types to compare effectively
    if(sf_left.column_type(left_join_positions.back()) !=
        sf_right.column_type(right_join_positions.back())) {
      log_and_throw("Columns " + col_pair.first + " and " + col_pair.second + 
          " does not have the same type in both SFrames.");
    }
  }
  
  // Figure out what join type we have to do
  boost::algorithm::to_lower(join_type);

  join_type_t in_join_type;
  if(join_type == "outer") {
    in_join_type = FULL_JOIN;
  } else if(join_type == "left") {
    in_join_type = LEFT_JOIN;
  } else if(join_type == "right") {
    in_join_type = RIGHT_JOIN;
  } else if(join_type == "inner") {
    in_join_type = INNER_JOIN;
  } else {
    log_and_throw("Invalid join type given!");
  }

  // execute the join (currently always a grace hash join; other join algorithms could be dispatched here later)
  join_impl::hash_join_executor join_executor(sf_left,
                                              sf_right,
                                              left_join_positions,
                                              right_join_positions,
                                              in_join_type,
                                              max_buffer_size);

  return join_executor.grace_hash_join();
}
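A minimal call sketch for the join entry point above. It is not part of the original source: sf_left and sf_right are assumed to be existing, non-empty SFrames that share an "id" column of the same type, and the buffer size is illustrative only.

  // Sketch only: inner join on a shared "id" column.
  // sf_left and sf_right are assumed, hypothetical SFrame variables.
  std::map<std::string, std::string> join_columns = {{"id", "id"}};
  sframe joined = join(sf_left, sf_right,
                       "inner",       // lower-cased internally, so "Inner" also works
                       join_columns,
                       1024 * 1024);  // max_buffer_size for the grace hash join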
Example 4
void sframe_reader::init(const sframe& frame, size_t num_segments) { 
  Dlog_func_entry();
  typedef sarray_reader<flexible_type> array_reader_type;
  ASSERT_MSG(!inited, "SFrame reader already inited");
  index_info = frame.get_index_info();
  // no columns. Just stop.
  if (index_info.column_names.size() == 0) {
    m_num_segments = 0;
    return;
  }
  if (num_segments == (size_t)(-1)) {
    // use the segmentation of the first column
    m_num_segments = frame.columns[0]->get_index_info().nsegments;
    std::vector<size_t> segment_sizes = frame.columns[0]->get_index_info().segment_sizes;
    for (size_t i = 0;i < index_info.column_names.size(); ++i) {
      column_data.emplace_back(std::move(frame.columns[i]->get_reader(segment_sizes)));
    }
  } else {
    // create num_segments worth of segments
    m_num_segments = num_segments;
    for (size_t i = 0;i < index_info.column_names.size(); ++i) {
      column_data.emplace_back(std::move(frame.columns[i]->get_reader(m_num_segments)));
    }
  }
}
Example 5
void subplan_executor::generate_to_sframe_segment(const std::shared_ptr<planner_node>& plan,
                                          sframe& out,
                                          size_t output_segment_id) {

  auto outiter = out.get_output_iterator(output_segment_id);

  generate_to_callback_function(
      plan, output_segment_id,
      [&](size_t segment_idx, const std::shared_ptr<sframe_rows>& rows) {
        (*outiter) = *rows;
        return false;
      });
}
Example 6
void sframe_reader::init(const sframe& frame, const std::vector<size_t>& segment_lengths) { 
  Dlog_func_entry();
  typedef sarray_reader<flexible_type> array_reader_type;
  ASSERT_MSG(!inited, "SFrame reader already inited");
  // Verify that lengths match up 
  index_info = frame.get_index_info();
  size_t sum = 0;
  for (size_t s: segment_lengths) sum += s;
  ASSERT_EQ(sum, size());

  m_num_segments = segment_lengths.size();
  for (size_t i = 0;i < index_info.column_names.size(); ++i) {
    column_data.emplace_back(std::move(frame.columns[i]->get_reader(segment_lengths)));
  }
}
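For context, a hedged sketch of how a caller typically drives a reader built by these init overloads. It only uses calls that already appear elsewhere in these examples (get_reader, num_segments, begin, end); "frame" is an assumed, already-populated sframe.

  // Sketch only: iterate every row of an assumed sframe "frame", one segment at a time.
  auto reader = frame.get_reader(thread::cpu_count());  // chooses the segment layout
  for (size_t seg = 0; seg < reader->num_segments(); ++seg) {
    auto iter = reader->begin(seg);
    auto enditer = reader->end(seg);
    while (iter != enditer) {
      const auto& row = *iter;  // one row of the frame
      // ... process row ...
      ++iter;
    }
  }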
Example 7
std::vector<sframe> shuffle(
    sframe sframe_in,
    size_t n,
    std::function<size_t(const std::vector<flexible_type>&)> hash_fn) {

    ASSERT_GT(n, 0);

    // split the work across threads: each of the cpu_count() workers handles a
    // contiguous block of rows, assuming rows are evenly distributed.
    size_t num_rows = sframe_in.num_rows();
    size_t num_workers = graphlab::thread::cpu_count();
    size_t rows_per_worker = num_rows / num_workers;

    // prepare the out sframe
    std::vector<sframe> sframe_out;
    std::vector<sframe::iterator> sframe_out_iter;
    sframe_out.resize(n);
    for (auto& sf: sframe_out) {
      sf.open_for_write(sframe_in.column_names(), sframe_in.column_types(), "",  1);
      sframe_out_iter.push_back(sf.get_output_iterator(0));
    }
    std::vector<std::unique_ptr<std::mutex>> sframe_out_locks;
    for (size_t i = 0; i < n; ++i) {
      sframe_out_locks.push_back(std::unique_ptr<std::mutex>(new std::mutex));
    }

    auto reader = sframe_in.get_reader();
    parallel_for(0, num_workers, [&](size_t worker_id) {
        size_t start_row = worker_id * rows_per_worker;
        size_t end_row = (worker_id == (num_workers-1)) ? num_rows
                                                        : (worker_id + 1) * rows_per_worker;

        // prepare thread local output buffer for each sframe
        std::vector<buffered_writer<std::vector<flexible_type>, sframe::iterator>> writers;
        for (size_t i = 0; i < n; ++i) {
          writers.push_back(
            buffered_writer<std::vector<flexible_type>, sframe::iterator>
            (sframe_out_iter[i], *sframe_out_locks[i],
             WRITER_BUFFER_SOFT_LIMIT, WRITER_BUFFER_HARD_LIMIT)
          );
        }

        std::vector<std::vector<flexible_type>> in_buffer(READER_BUFFER_SIZE);
        while (start_row < end_row) {
          // read a chunk of rows to shuffle
          size_t rows_to_read = std::min<size_t>((end_row - start_row), READER_BUFFER_SIZE);
          size_t rows_read = reader->read_rows(start_row, start_row + rows_to_read, in_buffer);
          DASSERT_EQ(rows_read, rows_to_read);
          start_row += rows_read;

          for (auto& row : in_buffer) {
            size_t out_index = hash_fn(row) % n;
            writers[out_index].write(row);
          }
        } // end of while

        // flush the rest of the buffer
        for (size_t i = 0; i < n; ++i) {
          writers[i].flush();
        }
    });

    // close all sframe writers
    for (auto& sf: sframe_out) {
      sf.close();
    }
    return sframe_out;
}
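A hedged usage sketch for shuffle above: partition an assumed sframe "data" into 4 buckets keyed on its first column. Note that row[0].hash() assumes flexible_type exposes a hash() member, which is not shown in the code here.

    // Sketch only: "data" is an assumed, existing sframe.
    // NOTE: row[0].hash() is an assumption about flexible_type's interface.
    std::vector<sframe> buckets = shuffle(
        data, 4,
        [](const std::vector<flexible_type>& row) -> size_t {
          return row[0].hash();
        });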
sframe groupby_aggregate(const sframe& source,
      const std::vector<std::string>& keys,
      const std::vector<std::string>& output_column_names,
      const std::vector<std::pair<std::vector<std::string>,
                                  std::shared_ptr<group_aggregate_value>>>& groups,
      size_t max_buffer_size) {
  // first, sanity checks
  // check that there is one output column name per group
  if (output_column_names.size() != groups.size()) {
    log_and_throw("There must be as many output columns as there are groups");
  }
  {
    // check that output column names are all unique and do not collide with
    // the keys. Empty names are auto-generated later, so we skip those here.
    std::set<std::string> all_output_columns(keys.begin(), keys.end());
    size_t named_column_count = 0;
    for (auto s: output_column_names) {
      if (!s.empty()) {
        all_output_columns.insert(s);
        ++named_column_count;
      }
    }
    if (all_output_columns.size() != keys.size() + named_column_count) {
      log_and_throw("Output columns names are not unique");
    }
  }

  for (const auto& key: keys) {
    // check that the column name is valid
    if (!source.contains_column(key)) {
      log_and_throw("SFrame does not contain column " + key);
    }
  }

  // check that each group is valid
  for (const auto& group: groups) {
    // check that the column name is valid
    if (group.first.size() > 0) {
      for(size_t index = 0; index < group.first.size();index++) { 
        auto& col_name = group.first[index];
        if (!source.contains_column(col_name)) {
          log_and_throw("SFrame does not contain column " + col_name);
        }

        if(graphlab::registered_arg_functions.count(group.second->name()) != 0 && index > 0) 
          continue; 
        // check that the types are valid
        size_t column_number = source.column_index(col_name);
        if (!group.second->support_type(source.column_type(column_number))) {
          log_and_throw("Requested operation: " + group.second->name() +
                        " not supported on the type of column " + col_name);
        }
      }
    }
  }

  // key should not have repeated columns
  std::set<std::string> key_columns;
  std::set<std::string> group_columns;
  for (const auto& key: keys) key_columns.insert(key);
  for (const auto& group: groups) {
    for(auto& col_name : group.first) {
      group_columns.insert(col_name);
    }
  }
  if (key_columns.size() != keys.size()) {
      log_and_throw("Group by key cannot have repeated column names");
  }

  // ok. select out just the columns I care about
  // begin with the key columns
  std::vector<std::string> all_columns(key_columns.begin(), key_columns.end());
  // then all the group columns (as long as they are not also key columns)
  for (const auto& group_column: group_columns) {
    if (group_column != "" && key_columns.count(group_column) == 0) {
      all_columns.push_back(group_column);
    }
  }
  sframe frame_with_relevant_cols = source.select_columns(all_columns);

  // prepare the output frame
  sframe output;
  std::vector<std::string> column_names;
  std::vector<flex_type_enum> column_types;
  // output frame has the key column name and types
  for (const auto& key: key_columns) {
    column_names.push_back(key);
    column_types.push_back(source.column_type(source.column_index(key)));
  }

  // then for each group, make a unique name and determine the output group type
  for (size_t i = 0;i < groups.size(); ++i) {
    const auto& group = groups[i];
    std::string candidate_name = output_column_names[i];
    if (candidate_name.empty()) {
      std::string root_candidate_name;
      if(graphlab::registered_arg_functions.count(group.second->name()) == 0) { 
        for (auto& col_name: group.first) {
          if (root_candidate_name.empty()) {
            root_candidate_name += " of " + col_name;
          } else {
            root_candidate_name += "_" + col_name;
          }
        }  
        root_candidate_name = group.second->name() + root_candidate_name;
      } else {
        
        if(group.first.size() != 2) 
          log_and_throw("arg functions takes exactly two arguments");
        root_candidate_name += group.first[1] + " for " + group.second->name() + " of " + group.first[0];  
      }
      candidate_name = root_candidate_name;
      size_t ctr = 1;
      // keep trying to come up with a unique column name
      while (std::find(column_names.begin(),
                       column_names.end(),
                       candidate_name) != column_names.end()) {
        candidate_name = root_candidate_name + "." + std::to_string(ctr);
        ++ctr;
      }
    }
    column_names.push_back(candidate_name);

    std::vector<flex_type_enum> input_types;
    for(auto col_name : group.first) {
      input_types.push_back(source.column_type(source.column_index(col_name)));
    }
    // this statement is valid for argmax and argmin as well, because their
    // set_input_types(...) simply returns the input types.
    auto output_type = group.second->set_input_types(input_types);
    column_types.push_back(output_type);
  }

  // done! now we can start on the groupby
  size_t nsegments = frame_with_relevant_cols.num_segments();
  // use whichever is larger: the input's segment count, or cpu_count * log2(cpu_count) buckets
  nsegments = std::max(nsegments,
                       thread::cpu_count() * std::max<size_t>(1, log2(thread::cpu_count())));

  output.open_for_write(column_names,
                        column_types,
                        "",
                        nsegments);


  groupby_aggregate_impl::group_aggregate_container
      container(max_buffer_size, nsegments);

  // The input sframe (frame_with_relevant_cols) contains all the values we
  // care about. The challenge is to figure out how the keys and values line
  // up: by construction all the key columns come first, but the group columns
  // can be anywhere.
  size_t num_keys = keys.size();
  for (const auto& group: groups) {
    std::vector<size_t> column_numbers;
    for(auto& col_name : group.first) {
      column_numbers.push_back(frame_with_relevant_cols.column_index(col_name));
    }

    container.define_group(column_numbers, group.second);
  }
  // done. now we can begin parallel processing

  // shuffle the rows based on the value of the key column.
  auto input_reader = frame_with_relevant_cols.get_reader(thread::cpu_count());
  graphlab::timer ti;
  logstream(LOG_INFO) << "Filling group container: " << std::endl;
  parallel_for (0, input_reader->num_segments(),
                [&](size_t i) {
                  auto iter = input_reader->begin(i);
                  auto enditer = input_reader->end(i);
                  while(iter != enditer) {
                    auto& row = *iter;
                    container.add(row, num_keys);
                    ++iter;
                  }
                });

  logstream(LOG_INFO) << "Group container filled in " << ti.current_time() << std::endl;
  logstream(LOG_INFO) << "Writing output: " << std::endl;
  ti.start();
  container.group_and_write(output);
  logstream(LOG_INFO) << "Output written in: " << ti.current_time() << std::endl;
  output.close();
  return output;
}
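Finally, a hedged call sketch for groupby_aggregate: group an assumed sframe "sales" by "user_id" and aggregate its "amount" column. The aggregator sum_op is assumed to be an existing std::shared_ptr<group_aggregate_value> (for example a SUM implementation) obtained elsewhere, since its construction is not part of the code above; the empty output name lets the function auto-generate one of the form "<aggregator name> of amount", per the naming logic shown.

  // Sketch only: "sales" and "sum_op" are assumed, hypothetical variables.
  std::vector<std::pair<std::vector<std::string>,
                        std::shared_ptr<group_aggregate_value>>> groups = {
    {{"amount"}, sum_op}
  };
  sframe result = groupby_aggregate(sales,
                                    {"user_id"},   // keys
                                    {""},          // output_column_names (auto-named)
                                    groups,
                                    1024 * 1024);  // max_buffer_size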