Ejemplo n.º 1
0
void unity_sgraph::fast_validate_add_edges(const sframe& edges,
    std::string src_field,
    std::string dst_field,
    size_t groupa, size_t groupb) const {

  if (!edges.contains_column(src_field)) {
    log_and_throw("Input sframe does not contain source id column: " + src_field);
  }
  if (!edges.contains_column(dst_field)) {
    log_and_throw("Input sframe does not contain target id column: " + dst_field);
  }

  flex_type_enum src_id_type = edges.column_type(edges.column_index(src_field));
  flex_type_enum dst_id_type = edges.column_type(edges.column_index(dst_field));

  if (src_id_type != dst_id_type) {
    std::string msg = "Source and target ids have different types: ";
    msg += std::string(flex_type_enum_to_name(src_id_type)) + " != " + flex_type_enum_to_name(dst_id_type);
    log_and_throw(msg);
  }

  if (src_id_type != flex_type_enum::INTEGER && src_id_type != flex_type_enum::STRING) {
    log_and_throw(
        std::string("Invalid id column type : ")
        + flex_type_enum_to_name(src_id_type)
        + ". Supported types are: integer and string."
    );
  }
}
/**
 * Verifies that the binary operator `op` can be applied to operands of types
 * `left` and `right`.
 *
 * Rules, evaluated in this order:
 *   - If either side is VECTOR, the operation is feasible when both sides are
 *     one of VECTOR / INTEGER / FLOAT. The operator string itself is not
 *     inspected in this case.
 *   - "+", "-", "*", "/": decided by flex_type_has_binary_op for that operator.
 *   - "%": both sides must be INTEGER.
 *   - "<", ">", "<=", ">=": decided by flex_type_has_binary_op for '<'.
 *   - "==", "!=", "&", "|": always feasible.
 *   - "in": both sides must be STRING.
 *   - Anything else is reported through log_and_throw as an invalid operation.
 *
 * \param left   type of the left operand
 * \param right  type of the right operand
 * \param op     operator, spelled as a string
 *
 * \throws std::string describing the operator and operand types when the
 *         operator is recognized but not feasible for the given types.
 */
void check_operation_feasibility(flex_type_enum left,
                                 flex_type_enum right,
                                 std::string op) {
  bool operation_is_feasible = false;

  if (left == flex_type_enum::VECTOR || right == flex_type_enum::VECTOR) {
    // special handling for vectors
    // we can perform every numeric op against numbers
    if (left == flex_type_enum::VECTOR || left == flex_type_enum::INTEGER || left == flex_type_enum::FLOAT) {
      if (right == flex_type_enum::VECTOR || right == flex_type_enum::INTEGER || right == flex_type_enum::FLOAT) {
          operation_is_feasible = true;
      }
    }
  } else if (op == "+" || op == "-" || op == "*" || op == "/") {
    operation_is_feasible = flex_type_has_binary_op(left, right, op[0]);
  } else if (op == "%") {
    // The arithmetic operators are consumed by the branch above, so modulo
    // is the only operator handled here; it is restricted to integers.
    operation_is_feasible = left == flex_type_enum::INTEGER &&
                            right == flex_type_enum::INTEGER;
  } else if (op == "<" || op == ">" || op == "<=" || op == ">=") {
    // the comparison operators are all compatible. we just need to check
    // the < operator
    operation_is_feasible = flex_type_has_binary_op(left, right, '<');
  } else if (op == "==" || op == "!=") {
    // equality comparison is always feasible
    operation_is_feasible = true;
  } else if (op == "&" || op == "|") {
    // boolean operations are always feasible
    operation_is_feasible = true;
  } else if (op == "in") {
    // substring test: only defined between two strings
    operation_is_feasible = left == flex_type_enum::STRING &&
                            right == flex_type_enum::STRING;
  } else {
    log_and_throw("Invalid scalar operation");
  }

  if (!operation_is_feasible) {
    // Thrown as a plain std::string; the thrown type is kept as-is so that
    // existing handlers continue to match.
    throw std::string("Unsupported type operation. cannot perform operation ") +
          op + " between " +
          flex_type_enum_to_name(left) + " and " +
          flex_type_enum_to_name(right);
  }
}
Ejemplo n.º 3
0
void unity_sgraph::fast_validate_add_vertices(const sframe& vertices,
    std::string id_field, size_t group) const {
  if (!vertices.contains_column(id_field)) {
    log_and_throw("Input sframe does not contain id column: " + id_field);
  }
  flex_type_enum id_type = vertices.column_type(vertices.column_index(id_field));

  if (id_type != flex_type_enum::INTEGER && id_type != flex_type_enum::STRING) {
    log_and_throw(
        std::string("Invalid id column type : ")
        + flex_type_enum_to_name(id_type) 
        + ". Supported types are: integer and string."
    );
  }
}
Ejemplo n.º 4
0
std::ostream& operator<<(std::ostream& out, const gl_sarray& other) {
  auto t = other.head(10);
  auto dtype = other.dtype();
  out << "dtype: " << flex_type_enum_to_name(dtype) << "\n";
  out << "Rows: " << other.size() << "\n";
  out << "[";
  bool first = true;
  for(auto i : t.range_iterator()) {
    if (!first) out << ",";
    if (dtype == flex_type_enum::STRING) out << "\"";
    if (i.get_type() == flex_type_enum::UNDEFINED) out << "None";
    else out << i;
    if (dtype == flex_type_enum::STRING) out << "\"";
    first = false;
  }
  out << "]" << "\n";
  return out;
}
Ejemplo n.º 5
0
  std::shared_ptr<sframe> sort(
    std::shared_ptr<lazy_sframe> sframe_ptr,
    const std::vector<std::string>& sort_column_names,
    const std::vector<bool>& sort_orders) {

    log_func_entry();

    // get sort column indexes from column names and also check column types
    std::vector<size_t> sort_column_indexes(sort_column_names.size());
    std::vector<flex_type_enum> supported_types =
        {flex_type_enum::STRING, flex_type_enum::INTEGER, flex_type_enum::FLOAT,flex_type_enum::DATETIME};
    std::set<flex_type_enum> supported_type_set(supported_types.begin(), supported_types.end());

    for(size_t i = 0; i < sort_column_names.size(); i++) {
      sort_column_indexes[i] = sframe_ptr->column_index(sort_column_names[i]);
      auto col_type = sframe_ptr->column_type(sort_column_indexes[i]);

      if (supported_type_set.count(col_type) == 0) {
        log_and_throw("Only column with type 'int', 'float', 'string', and 'datetime' can be sorted. Column '" +
            sort_column_names[i] + "'' is type: " + flex_type_enum_to_name(col_type));
      }
    }

    // Estimate the size of the sframe so that we could decide number of
    // chunks.  To account for strings, we estimate each cell is 64 bytes.
    // I'd love to estimate better.
    size_t estimated_sframe_size = sframe_num_cells(sframe_ptr) * 64.0;
    size_t num_partitions = std::ceil((1.0 * estimated_sframe_size) / sframe_config::SFRAME_SORT_BUFFER_SIZE);

    // Make partitions small enough for each thread to (theoretically) sort at once
    num_partitions = num_partitions * thread::cpu_count();

    // If we have more partitions than this, we could run into open file
    // descriptor limits
    num_partitions = std::min<size_t>(num_partitions, SFRAME_SORT_MAX_SEGMENTS);
    DASSERT_TRUE(num_partitions > 0);

    // Shortcut -- if only one partition, do a in memory sort and we are done
    if (num_partitions <= thread::cpu_count()) {
      logstream(LOG_INFO) << "Sorting SFrame in memory" << std::endl;
      return sframe_sort_impl::sort_sframe_in_memory(sframe_ptr, sort_column_indexes, sort_orders);
    }

    // This is a collection of partition keys sorted in the required order.
    // Each key is a flex_list value that contains the spliting value for
    // each sort column. Together they defines the "cut line" for all rows in
    // the SFrame.
    std::vector<flexible_type> partition_keys;


    // Do a quantile sketch on the sort columns to figure out the "splitting" points
    // for the SFrame
    timer ti;
    bool all_sorted = sframe_sort_impl::get_partition_keys(
      sframe_ptr->select_columns(sort_column_names),
      sort_orders, num_partitions, // in parameters
      partition_keys);  // out parameters
    logstream(LOG_INFO) << "Pivot estimation step: " << ti.current_time() << std::endl;

    // In rare case all values in the SFrame are the same, so no need to sort
    if (all_sorted) return sframe_ptr->get_sframe_ptr();

    // scatter partition the sframe into multiple chunks, chunks are relatively
    // sorted, but each chunk is not sorted. The sorting of each chunk is delayed
    // until it is consumed. Each chunk is stored as one segment for a sarray.
    // The chunk stores a serailized version of key and value
    std::vector<size_t> partition_sizes;

    // In the case where all sort keys in a given partition are the same, then
    // there is no need to sort the partition. This information is derived from
    // scattering
    std::vector<bool> partition_sorted(num_partitions, true);
    ti.start();
    auto partition_array = sframe_sort_impl::scatter_partition(
      sframe_ptr, sort_column_indexes, sort_orders, partition_keys, partition_sizes, partition_sorted);
    logstream(LOG_INFO) << "Scatter step: " << ti.current_time() << std::endl;

    // return a lazy sframe_ptr that would emit the sorted data lazily
    auto lazy_sort = std::make_shared<le_sort>(
      partition_array, partition_sorted, partition_sizes, sort_column_indexes,
      sort_orders, sframe_ptr->column_names(), sframe_ptr->column_types());

    return lazy_sort->eager_sort();
  }
Ejemplo n.º 6
0
/**
 * Computes a cumulative (running) aggregate over this SArray.
 *
 * For each row the output holds the value emitted by the aggregator after
 * it has consumed every defined value up to and including that row. Rows
 * holding an undefined value are not fed to the aggregator; they repeat the
 * most recently emitted value.
 *
 * When there are more rows than threads the work is split into one
 * contiguous block of rows per thread:
 *   Phase 1: aggregate each block independently.
 *   Phase 2: fold the blocks so that entry i covers blocks 0..i.
 *   Phase 3: re-scan each block, seeded with the total of all preceding
 *            blocks, and write the running value for every row.
 * Otherwise only phase 3 runs, as a single block.
 *
 * \param aggregator  prototype aggregator; working copies are created with
 *                    new_instance()
 * \return a new gl_sarray of the aggregator's output type; an empty array
 *         when this array is empty
 *
 * An input type rejected by aggregator->support_type() is reported through
 * log_and_throw.
 */
gl_sarray gl_sarray::cumulative_aggregate(
     std::shared_ptr<group_aggregate_value> aggregator) const {

  const flex_type_enum in_type = this->dtype();
  const flex_type_enum out_type = aggregator->set_input_types({in_type});
  if (!aggregator->support_type(in_type)) {
    std::stringstream err;
    err << "Cannot perform this operation on an SArray of type "
        << flex_type_enum_to_name(in_type) << "." << std::endl;
    log_and_throw(err.str());
  }

  // Nothing to aggregate: hand back an empty array of the output type.
  const size_t num_rows = this->size();
  if (num_rows == 0) {
    return gl_sarray({}, out_type);
  }

  // One output segment and one freshly initialized aggregator per thread.
  const size_t num_workers = thread::cpu_count();
  gl_sarray_writer writer(out_type, num_workers);
  std::vector<std::shared_ptr<group_aggregate_value>> block_totals;
  for (size_t w = 0; w < num_workers; ++w) {
    std::shared_ptr<group_aggregate_value> fresh(aggregator->new_instance());
    block_totals.push_back(fresh);
  }

  // The blocked scheme is only used when there are more rows than threads.
  const bool split_into_blocks = num_rows > num_workers;

  // Phases 1 and 2 are pointless with a single thread or a single block.
  if (num_workers > 1 && split_into_blocks) {

    // Phase 1: aggregate the rows of every block on its own.
    in_parallel([&](size_t block, size_t num_blocks) {
      const size_t first_row = block * num_rows / num_blocks;
      const size_t last_row = (block + 1) * num_rows / num_blocks;
      for (const auto& value : this->range_iterator(first_row, last_row)) {
        DASSERT_TRUE(block < block_totals.size());
        if (value != FLEX_UNDEFINED) {
          block_totals[block]->add_element_simple(value);
        }
      }
    });

    // Phase 2: walk from the last block down so that every lower entry is
    // still a plain per-block total when it gets folded into a higher one.
    for (size_t upper = num_workers - 1; upper > 0; upper--) {
      for (size_t lower = 0; lower < upper; lower++) {
        DASSERT_TRUE(upper < block_totals.size());
        DASSERT_TRUE(lower < block_totals.size());
        block_totals[upper]->combine(*block_totals[lower]);
      }
    }
  }

  // Phase 3: re-scan one block, starting from the combined state of all the
  // blocks before it, and write the running value for each row.
  auto write_block = [&](size_t block, size_t num_blocks) {
    flexible_type running = FLEX_UNDEFINED;
    const size_t first_row = block * num_rows / num_blocks;
    const size_t last_row = (block + 1) * num_rows / num_blocks;
    std::shared_ptr<group_aggregate_value> scan(aggregator->new_instance());

    // Seed with the merged total of the preceding blocks, if any.
    if (block >= 1) {
      DASSERT_TRUE(block - 1 < block_totals.size());
      running = block_totals[block - 1]->emit();
      scan->combine(*block_totals[block - 1]);
    }

    for (const auto& value : this->range_iterator(first_row, last_row)) {
      // Undefined inputs leave the running value untouched.
      if (value != FLEX_UNDEFINED) {
        scan->add_element_simple(value);
        running = scan->emit();
      }
      writer.write(running, block);
    }
  };

  if (split_into_blocks) {
    in_parallel(write_block);
  } else {
    // Not more rows than threads: treat the whole array as one block.
    write_block(0, 1);
  }
  return writer.close();
}