Ejemplo n.º 1
0
  // Recursively validates this node's bookkeeping: the pnode mirror, the
  // bidirectional input/output links, and the column counts.  `seen` keeps
  // the traversal from revisiting nodes it has already checked.
  inline void _debug_check_consistency(std::set<const node_info*>& seen) const {
    // Already validated during this traversal; nothing more to do.
    if(seen.count(this))
      return;
    seen.insert(this);

    DASSERT_TRUE(pnode->operator_type == type);
    DASSERT_EQ(pnode->inputs.size(), inputs.size());
    DASSERT_TRUE(is_source_node() || !inputs.empty());

    // num_inputs == -1 means "unconstrained"; otherwise it must match.
    if(attributes.num_inputs != -1)
      DASSERT_EQ(inputs.size(), attributes.num_inputs);

    DASSERT_EQ(num_columns(), infer_planner_node_num_output_columns(pnode));

    {
      // Each input must link back to us exactly as many times as it
      // appears in our input list (inputs may repeat).
      std::map<const node_info*, size_t> in_multiplicity;
      for(const auto& in : inputs)
        in_multiplicity[in.get()] += 1;

      for(size_t idx = 0; idx < inputs.size(); ++idx) {

        DASSERT_TRUE(pnode->inputs[idx] == inputs[idx]->pnode);

        size_t back_links = 0;
        for(const cnode_info_ptr& candidate : inputs[idx]->outputs) {
          if(candidate.get() == this)
            ++back_links;
        }

        DASSERT_EQ(back_links, in_multiplicity.at(inputs[idx].get()));
      }
    }

    {
      // Symmetric check: each output must reference us exactly as many
      // times as it appears in our output list.
      std::map<const node_info*, size_t> out_multiplicity;
      for(const auto& out : outputs)
        out_multiplicity[out.get()] += 1;

      for(size_t idx = 0; idx < outputs.size(); ++idx) {
        size_t back_links = 0;
        for(const cnode_info_ptr& candidate : outputs[idx]->inputs) {
          if(candidate.get() == this)
            ++back_links;
        }
        DASSERT_EQ(back_links, out_multiplicity.at(outputs[idx].get()));
      }
    }

    // Recurse over the rest of the graph: outputs first, then inputs,
    // matching the original traversal order.
    for(const auto& out : outputs)
      out->_debug_check_consistency(seen);

    for(const auto& in : inputs)
      in->_debug_check_consistency(seen);
  }
Ejemplo n.º 2
0
/**
 * Builds an sframe summarizing the groups: one row per group, containing
 * the group's key column(s) followed by a "group_size" column holding the
 * number of rows in that group.
 *
 * \return gl_sframe with columns {key columns..., "group_size"}.
 * \throws if no groups are present.
 */
gl_sframe grouped_sframe::group_info() const {
  if (m_group_names.size() == 0) {
    log_and_throw("No groups present. Cannot obtain group info.");
  }

  // Output column names: the key columns plus the group size.
  std::vector<std::string> ret_column_names = m_key_col_names;
  ret_column_names.push_back("group_size");
  DASSERT_EQ(ret_column_names.size(), m_key_col_names.size() + 1);

  // Output column types, inferred from the first group's key.
  // BUG FIX: was `> 1`, which wrongly fired when exactly one group exists;
  // the guard above only rejects zero groups, and one group is enough here.
  DASSERT_TRUE(m_group_names.size() >= 1);
  std::vector<flex_type_enum> ret_column_types;
  flexible_type first_key = m_group_names[0];
  flex_type_enum key_type = first_key.get_type();
  if (key_type == flex_type_enum::LIST) {
    // Composite key: one output column per key component.
    for (size_t k = 0; k < first_key.size(); k++) {
      ret_column_types.push_back(first_key.array_at(k).get_type());
    }
  } else {
    ret_column_types.push_back(key_type);
  }
  ret_column_types.push_back(flex_type_enum::INTEGER);
  DASSERT_EQ(ret_column_types.size(), ret_column_names.size());

  // Prepare for writing.
  size_t num_segments = thread::cpu_count();
  gl_sframe_writer writer(ret_column_names, ret_column_types, num_segments);
  size_t range_dir_size = m_range_directory.size();

  // Write the group info: each thread handles a contiguous slice of groups.
  in_parallel([&](size_t thread_idx, size_t num_threads) {

    size_t start_idx = range_dir_size * thread_idx / num_threads;
    size_t end_idx = range_dir_size * (thread_idx + 1) / num_threads;

    for (size_t i = start_idx; i < end_idx; i++) {
      // Group i spans [m_range_directory[i], m_range_directory[i+1]),
      // with the last group running to the end of the grouped sframe.
      size_t range_start = m_range_directory[i];
      size_t range_end = 0;
      if ((i + 1) == range_dir_size) {
        range_end = m_grouped_sf.size();
      } else {
        range_end = m_range_directory[i + 1];
      }
      size_t num_rows = range_end - range_start;
      std::vector<flexible_type> vals = m_group_names[i];
      vals.push_back(num_rows);
      DASSERT_EQ(vals.size(), ret_column_names.size());
      writer.write(vals, thread_idx);
    }
  });

  // BUG FIX: close() must be called exactly once, after all worker threads
  // have finished, and its result is this function's return value.
  // Previously it was invoked inside the per-thread lambda (once per
  // thread, result discarded) and the function never returned a value.
  return writer.close();
}
Ejemplo n.º 3
0
void continuous::merge_results(std::vector<continuous_result>& thread_results) {
  for (auto& thread_result : thread_results) {
    flexible_type combined_min = std::min(m_transformer.min, thread_result.min);
    flexible_type combined_max = std::max(m_transformer.max, thread_result.max);
    m_transformer.min = combined_min;
    m_transformer.max = combined_max;
    m_transformer.rescale(combined_min, combined_max);
    thread_result.rescale(combined_min, combined_max);
    DASSERT_EQ(m_transformer.scale_min, thread_result.scale_min);
    DASSERT_EQ(m_transformer.scale_max, thread_result.scale_max);
    for (size_t i=0; i<continuous_result::MAX_BINS; i++) {
      m_transformer.bins[i] += thread_result.bins[i];
    }
  }
}
Ejemplo n.º 4
0
/**
 * Serialize one sframe row (parallel name/value vectors) into a JSONNode.
 *
 * \param column_names   Column names; must match column_values in length.
 * \param column_values  The row's values, one per column.
 * \param node           Destination node; one child is appended per column.
 */
void sframe_row_to_json(const std::vector<std::string>& column_names,
                        const std::vector<flexible_type>& column_values,
                        JSONNode& node) {
    DASSERT_EQ(column_names.size(), column_values.size());
    const size_t n_cols = column_names.size();
    for (size_t col = 0; col < n_cols; ++col) {
        node.push_back(flexible_type_to_json(column_values[col], column_names[col]));
    }
}
Ejemplo n.º 5
0
 /**
  * Decrements the value of a log entry by `value` on the calling thread's
  * local counter.  (The previous comment said "Increments", but the body
  * subtracts.)
  *
  * \param entry  Index of the log entry; must be < MAX_LOG_SIZE.
  * \param value  Amount to subtract from the thread-local counter.
  */
 void thr_dec_log_entry(size_t entry, double value) {
   event_log_thread_local_type* ev = get_thread_counter_ref();
   DASSERT_LT(entry, MAX_LOG_SIZE);
   // does not work for cumulative logs
   DASSERT_NE((int)logs[entry]->logtype, (int) log_type::CUMULATIVE);
   // Callback-driven entries presumably source their values from the
   // callback and must not be adjusted directly — TODO confirm.
   DASSERT_EQ(logs[entry]->is_callback_entry, false);
   ev->values[entry] -= value;
 }
Ejemplo n.º 6
0
 /**
  * Flushes all buckets, then sorts each bucket's contents in parallel and
  * writes them into the matching output segment of `out`.
  */
 void sort_and_write(SIterableType& out) {
   // Push any buffered elements down to the sink before sorting.
   parallel_for(0, num_buckets(), [&](size_t bucket_id) {
     buckets[bucket_id]->flush();
   });
   sarray_sink->close();

   typedef typename SIterableType::iterator OutIterator;
   // One output segment per bucket; each bucket sorts independently.
   DASSERT_EQ(out.num_segments(), buckets.size());
   parallel_for(0, buckets.size(), [&](size_t bucket_id) {
     buckets[bucket_id]->template sort_and_write<OutIterator>(
         out.get_output_iterator(bucket_id));
   });
   out.close();
 }
Ejemplo n.º 7
0
/**
 * Advances the block window and reads the next chunk of rows from every
 * source into the per-source buffers.
 */
void parallel_sframe_iterator::load_current_block() {
  // A new block is only loaded once the previous one is fully consumed.
  DASSERT_EQ(current_idx, block_end_idx);

  block_start_idx = current_idx;
  block_end_idx = std::min(end_idx, block_end_idx + max_block_size);

  // Empty window (nothing left to read): drop any stale buffered rows.
  if(block_start_idx == block_end_idx) {
    for(auto& buf : buffers)
      buf.clear();
  }

  for(size_t src = 0; src < sources.size(); ++src) {
    sources[src]->read_rows(block_start_idx, block_end_idx, buffers[src]);
  }
}
Ejemplo n.º 8
0
/**
 * Hash-partitions the rows of `sframe_in` into `n` output sframes.
 *
 * Each row is routed to output bucket `hash_fn(row) % n`.  Worker threads
 * read disjoint row ranges and write through per-thread buffered writers;
 * each output sframe has a single segment whose iterator is shared across
 * threads and serialized by a per-output mutex.
 *
 * \param sframe_in  Input sframe to shuffle.
 * \param n          Number of output sframes; must be > 0.
 * \param hash_fn    Maps a row to a hash; the bucket is hash % n.
 * \return           Vector of `n` closed sframes holding the partitions.
 */
std::vector<sframe> shuffle(
    sframe sframe_in,
    size_t n,
    std::function<size_t(const std::vector<flexible_type>&)> hash_fn) {

    ASSERT_GT(n, 0);

    // split the work to threads
    // for n bins let's assign n / log(n) workers, assuming rows are evenly distributed.
    size_t num_rows = sframe_in.num_rows();
    size_t num_workers = graphlab::thread::cpu_count();
    size_t rows_per_worker = num_rows / num_workers;

    // prepare the out sframe
    // Each output is opened with exactly one segment; all threads share
    // that segment's output iterator, guarded by the mutex created below.
    std::vector<sframe> sframe_out;
    std::vector<sframe::iterator> sframe_out_iter;
    sframe_out.resize(n);
    for (auto& sf: sframe_out) {
      sf.open_for_write(sframe_in.column_names(), sframe_in.column_types(), "",  1);
      sframe_out_iter.push_back(sf.get_output_iterator(0));
    }
    // Held by unique_ptr because std::mutex is neither movable nor copyable.
    std::vector<std::unique_ptr<std::mutex>> sframe_out_locks;
    for (size_t i = 0; i < n; ++i) {
      sframe_out_locks.push_back(std::unique_ptr<std::mutex>(new std::mutex));
    }

    auto reader = sframe_in.get_reader();
    parallel_for(0, num_workers, [&](size_t worker_id) {
        // Each worker takes a contiguous row range; the last worker also
        // absorbs the remainder rows from the integer division above.
        size_t start_row = worker_id * rows_per_worker;
        size_t end_row = (worker_id == (num_workers-1)) ? num_rows
                                                        : (worker_id + 1) * rows_per_worker;

        // prepare thread local output buffer for each sframe
        std::vector<buffered_writer<std::vector<flexible_type>, sframe::iterator>> writers;
        for (size_t i = 0; i < n; ++i) {
          writers.push_back(
            buffered_writer<std::vector<flexible_type>, sframe::iterator>
            (sframe_out_iter[i], *sframe_out_locks[i],
             WRITER_BUFFER_SOFT_LIMIT, WRITER_BUFFER_HARD_LIMIT)
          );
        }

        std::vector<std::vector<flexible_type>> in_buffer(READER_BUFFER_SIZE);
        while (start_row < end_row) {
          // read a chunk of rows to shuffle
          size_t rows_to_read = std::min<size_t>((end_row - start_row), READER_BUFFER_SIZE);
          size_t rows_read = reader->read_rows(start_row, start_row + rows_to_read, in_buffer);
          DASSERT_EQ(rows_read, rows_to_read);
          start_row += rows_read;

          // NOTE(review): iterating all of in_buffer assumes read_rows
          // resizes it to exactly the rows read on short chunks — confirm
          // against the sframe reader's contract.
          for (auto& row : in_buffer) {
            size_t out_index = hash_fn(row) % n;
            writers[out_index].write(row);
          }
        } // end of while

        // flush the rest of the buffer
        for (size_t i = 0; i < n; ++i) {
          writers[i].flush();
        }
    });

    // close all sframe writers
    for (auto& sf: sframe_out) {
      sf.close();
    }
    return sframe_out;
}
Ejemplo n.º 9
0
 /**
  * Increments the value of a log entry
  */
 inline void thr_inc_log_entry(size_t entry, double value) {
   event_log_thread_local_type* ev = get_thread_counter_ref();
   DASSERT_LT(entry, MAX_LOG_SIZE);
   DASSERT_EQ(logs[entry]->is_callback_entry, false);
   ev->values[entry] += value;
 }