Example No. 1
  inline void _debug_check_consistency(std::set<const node_info*>& seen) const {
    if(seen.count(this))
      return;
    seen.insert(this);

    DASSERT_TRUE(pnode->operator_type == type);
    DASSERT_EQ(pnode->inputs.size(), inputs.size());
    DASSERT_TRUE(is_source_node() || !inputs.empty());

    if(attributes.num_inputs != -1)
      DASSERT_EQ(inputs.size(), attributes.num_inputs);

    DASSERT_EQ(num_columns(), infer_planner_node_num_output_columns(pnode));

    {
      // Make sure this node's inputs all link back to it consistently.
      std::map<const node_info*, size_t> input_counts;
      for(size_t i = 0; i < inputs.size(); ++i)
        input_counts[inputs[i].get()] += 1;

      for(size_t i = 0; i < inputs.size(); ++i) {

        DASSERT_TRUE(pnode->inputs[i] == inputs[i]->pnode);
        size_t n_present = 0;
        for(const cnode_info_ptr& out : inputs[i]->outputs) {

          if(out.get() == this)
            ++n_present;
        }

        DASSERT_EQ(n_present, input_counts.at(inputs[i].get()));
      }
    }

    {
      // Make sure this node's outputs all link back to it consistently.
      std::map<const node_info*, size_t> output_counts;
      for(size_t i = 0; i < outputs.size(); ++i)
        output_counts[outputs[i].get()] += 1;

      for(size_t i = 0; i < outputs.size(); ++i) {
        size_t n_present = 0;
        for(const cnode_info_ptr& out : outputs[i]->inputs) {
          if(out.get() == this)
            ++n_present;
        }
        DASSERT_EQ(n_present, output_counts.at(outputs[i].get()));
      }

    }

    for(size_t i = 0; i < outputs.size(); ++i) {
      outputs[i]->_debug_check_consistency(seen);
    }

    for(size_t i = 0; i < inputs.size(); ++i) {
      inputs[i]->_debug_check_consistency(seen);
    }
  }
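
A minimal usage sketch (the root name and pointer style are assumptions; the checker belongs to the planner's node_info type shown above). The seen set memoizes visited nodes so the recursive walk over inputs and outputs terminates on a DAG:

  // Hypothetical caller: validate the whole plan starting from any node.
  std::set<const node_info*> seen;
  root->_debug_check_consistency(seen);  // recursively visits inputs and outputs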
Example No. 2
/**
 * Utility function that throws an error if the vectors in an SArray are not
 * all of the same length.
 * \param[in] in gl_sarray of type vector
 */
void check_vector_equal_size(const gl_sarray& in) {
  // Initialize. 
  DASSERT_TRUE(in.dtype() == flex_type_enum::VECTOR); 
  size_t n_threads = thread::cpu_count();
  n_threads = std::max(n_threads, size_t(1));
  size_t m_size = in.size();
          
  // Helper that formats and throws a size-mismatch error.
  auto throw_error = [] (size_t row_number, size_t expected, size_t current) {
    std::stringstream ss;
    ss << "Vectors must be of the same size. Row " << row_number 
       << " contains a vector of size " << current << ". Expected a vector of"
       << " size " << expected << "." << std::endl;
    log_and_throw(ss.str());
  };
  
  // Within each block of the SArray, check that the vectors have the same size.
  std::vector<size_t> expected_sizes (n_threads, size_t(-1));
  in_parallel([&](size_t thread_idx, size_t n_threads) {
    size_t start_row = thread_idx * m_size / n_threads; 
    size_t end_row = (thread_idx + 1) * m_size / n_threads;
    size_t expected_size = size_t(-1);
    size_t row_number = start_row;
    for (const auto& v: in.range_iterator(start_row, end_row)) {
      if (v != FLEX_UNDEFINED) {
        if (expected_size == size_t(-1)) {
          expected_size = v.size();
          expected_sizes[thread_idx] = expected_size; 
        } else {
          DASSERT_TRUE(v.get_type() == flex_type_enum::VECTOR);
          if (expected_size != v.size()) {
            throw_error(row_number, expected_size, v.size());
          }
        }
      }
      row_number++;
    }
  });

  // Make sure sizes across blocks are also the same.
  size_t vector_size = size_t(-1);
  for (size_t thread_idx = 0; thread_idx < n_threads; thread_idx++) {
    // If this block contains all None values, skip it.
    if (expected_sizes[thread_idx] != size_t(-1)) {

      if (vector_size == size_t(-1)) {
          vector_size = expected_sizes[thread_idx]; 
      } else {
         if (expected_sizes[thread_idx] != vector_size) {
           throw_error(thread_idx * m_size / n_threads, 
                              vector_size, expected_sizes[thread_idx]);
         } 
      }
    }
  }
}
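
A minimal usage sketch, assuming the gl_sarray constructor taking a std::vector<flexible_type> plus a dtype is available and that flex_vec rows convert to flexible_type:

  // Build a small vector-typed gl_sarray and validate it.
  std::vector<flexible_type> rows = {flex_vec{1.0, 2.0, 3.0},
                                     flex_vec{4.0, 5.0, 6.0}};
  gl_sarray sa(rows, flex_type_enum::VECTOR);
  check_vector_equal_size(sa);  // passes: both rows have size 3
  // Appending a row such as flex_vec{1.0} would make the call log_and_throw.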
Example No. 3
 select_edge_fields_op(const std::vector<std::string>& _fields, size_t groupa, size_t groupb) :
     groupa(groupa), groupb(groupb) {
     std::set<std::string> unique_fields;
     for (const auto& f: _fields) {
         if (!unique_fields.count(f)) {
             fields.push_back(f);
             unique_fields.insert(unique_fields.end(), f);
         }
     }
     DASSERT_TRUE(unique_fields.count(sgraph::SRC_COLUMN_NAME) > 0);
     DASSERT_TRUE(unique_fields.count(sgraph::DST_COLUMN_NAME) > 0);
 }
Example No. 4
void pysgraph_synchronize::load_vertex_partition(size_t partition_id, std::vector<sgraph_vertex_data>& vertices) {
  DASSERT_LT(partition_id, m_num_partitions);
  DASSERT_FALSE(m_is_partition_loaded[partition_id]);
  m_vertex_partitions[partition_id] = std::move(vertices);
  m_is_partition_loaded[partition_id] = true;
  DASSERT_TRUE(is_loaded(partition_id));
}
Example No. 5
    /// Acquires a lock on the mutex
    inline void lock() const {
      int error = pthread_mutex_lock( &m_mut  );
      DASSERT_MSG(!error, "Mutex lock error %d", error);
#ifdef _WIN32
      DASSERT_TRUE(!locked);
      locked = true;
#endif
    }
Example No. 6
 select_vertex_fields_op(const std::vector<std::string>& _fields, size_t group) :
     group(group) {
     std::set<std::string> unique_fields;
     for (const auto& f: _fields) {
         if (!unique_fields.count(f)) {
             fields.push_back(f);
             unique_fields.insert(unique_fields.end(), f);
         }
     }
     DASSERT_TRUE(unique_fields.count(sgraph::VID_COLUMN_NAME) > 0);
 }
Example No. 7
void pysgraph_synchronize::update_vertex_partition(vertex_partition_exchange& vpartition_exchange) {
  DASSERT_TRUE(m_is_partition_loaded[vpartition_exchange.partition_id]);

  auto& vertex_partition = m_vertex_partitions[vpartition_exchange.partition_id];
  auto& fields_ids = vpartition_exchange.field_ids;
  for (auto& vid_data_pair : vpartition_exchange.vertices) {
    size_t id = vid_data_pair.first;
    sgraph_vertex_data& vdata = vid_data_pair.second;
    for (size_t i = 0; i < fields_ids.size(); ++i)
      vertex_partition[id][fields_ids[i]] = vdata[i];
  }
}
Example No. 8
gl_sframe grouped_sframe::group_info() const {
  if (m_group_names.size() == 0) {
    log_and_throw("No groups present. Cannot obtain group info.");
  }
 
  // Return column names. 
  std::vector<std::string> ret_column_names = m_key_col_names;
  ret_column_names.push_back("group_size");
  DASSERT_EQ(ret_column_names.size(), m_key_col_names.size() + 1);

  // Return column types from the first group info. 
  DASSERT_TRUE(m_group_names.size() > 0);
  std::vector<flex_type_enum> ret_column_types;
  flexible_type first_key = m_group_names[0];
  flex_type_enum key_type = first_key.get_type();
  if (key_type == flex_type_enum::LIST) {
    for (size_t k = 0; k < first_key.size(); k++) {
      ret_column_types.push_back(first_key.array_at(k).get_type());
    }
  } else {
    ret_column_types.push_back(key_type);
  }
  ret_column_types.push_back(flex_type_enum::INTEGER);
  DASSERT_EQ(ret_column_types.size(), ret_column_names.size());
  
  // Prepare for writing.
  size_t num_segments = thread::cpu_count();
  gl_sframe_writer writer(ret_column_names, ret_column_types, num_segments);
  size_t range_dir_size = m_range_directory.size();

  // Write the group info.
  in_parallel([&](size_t thread_idx, size_t num_threads) {

    size_t start_idx = range_dir_size * thread_idx / num_threads;
    size_t end_idx = range_dir_size * (thread_idx + 1) / num_threads;

    for (size_t i = start_idx; i < end_idx; i++) { 
      size_t range_start = m_range_directory[i];
      size_t range_end = 0;
      if((i + 1) == m_range_directory.size()) {
        range_end = m_grouped_sf.size();
      } else {
        range_end = m_range_directory[i + 1];
      }
      size_t num_rows = range_end - range_start;
      std::vector<flexible_type> vals = m_group_names[i];
      vals.push_back(num_rows);
      DASSERT_EQ(vals.size(), ret_column_names.size());
      writer.write(vals, thread_idx);
    }
  });

  return writer.close();
}
Example No. 9
std::vector<sgraph_edge_data> graph_pylambda_evaluator::eval_triple_apply(
    const std::vector<sgraph_edge_data>& all_edge_data,
    size_t src_partition, size_t dst_partition,
    const std::vector<size_t>& mutated_edge_field_ids) {

  std::lock_guard<mutex> lg(m_mutex);
  
  logstream(LOG_INFO) << "graph_lambda_worker eval triple apply " << src_partition 
                      << ", " << dst_partition << std::endl;
  
  DASSERT_TRUE(is_loaded(src_partition));
  DASSERT_TRUE(is_loaded(dst_partition));

  auto& source_partition = m_graph_sync.get_partition(src_partition);
  auto& target_partition = m_graph_sync.get_partition(dst_partition);

  std::vector<std::string> mutated_edge_keys;
  for (size_t fid: mutated_edge_field_ids) {
    mutated_edge_keys.push_back(m_edge_keys[fid]);
  }

  std::vector<sgraph_edge_data> ret(all_edge_data.size());

  lambda_graph_triple_apply_data lgt;

  lgt.all_edge_data = &all_edge_data;
  lgt.out_edge_data = &ret;
  lgt.source_partition = &source_partition;
  lgt.target_partition = &target_partition;
  lgt.vertex_keys = &m_vertex_keys;
  lgt.edge_keys = &m_edge_keys;
  lgt.mutated_edge_keys = &mutated_edge_keys;
  lgt.srcid_column = m_srcid_column;
  lgt.dstid_column = m_dstid_column;

  evaluation_functions.eval_graph_triple_apply(m_lambda_id, &lgt);
  python::check_for_python_exception();
  
  return ret;
}
Example No. 10
/** Turns a node graph into one with all the source nodes segmented.
 *  Used to run a section in parallel.
 */
pnode_ptr make_segmented_graph(pnode_ptr n, size_t segment_idx, 
    size_t num_segments, std::map<pnode_ptr, pnode_ptr>& memo) {
  if (memo.count(n)) return memo[n];
  if(num_segments == 0) {
    memo[n] = n;
    return n;
  }

  pnode_ptr ret(new planner_node(*n));

  if(is_source_node(n)) {

    // First, if it's a source node, then it should have begin_index
    // and end_index in the operator_parameters.

    DASSERT_TRUE(n->operator_parameters.count("begin_index"));
    DASSERT_TRUE(n->operator_parameters.count("end_index"));

    size_t old_begin_index = n->operator_parameters.at("begin_index");
    size_t old_end_index = n->operator_parameters.at("end_index");

    size_t old_length = old_end_index - old_begin_index;

    size_t new_begin_index = old_begin_index + (segment_idx * old_length) / num_segments;
    size_t new_end_index = old_begin_index + ((segment_idx + 1) * old_length) / num_segments;

    DASSERT_LE(old_begin_index, new_begin_index);
    DASSERT_LE(new_end_index, old_end_index);

    ret->operator_parameters["begin_index"] = new_begin_index;
    ret->operator_parameters["end_index"] = new_end_index;

  } else {
    for(size_t i = 0; i < ret->inputs.size(); ++i) {
      ret->inputs[i] = make_segmented_graph(ret->inputs[i], segment_idx, num_segments, memo);
    }
  }
  memo[n] = ret;
  return ret;
}
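
A sketch of how a caller might split a plan into per-thread subgraphs (the plan_root name is an assumption; everything else follows from the function above). For a source node covering rows [0, 100) and 4 segments, segment i reads rows [100*i/4, 100*(i+1)/4), i.e. [0,25), [25,50), [50,75), [75,100):

  std::vector<pnode_ptr> segments;
  for (size_t seg = 0; seg < 4; ++seg) {
    std::map<pnode_ptr, pnode_ptr> memo;  // one memo table per segment
    segments.push_back(make_segmented_graph(plan_root, seg, 4, memo));
  }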
Example No. 11
size_t block_writer::write_block(size_t segment_id,
                                 size_t column_id, 
                                 char* data,
                                 block_info block) {
  DASSERT_LT(segment_id, m_index_info.nsegments);
  DASSERT_LT(column_id, m_index_info.columns.size());
  DASSERT_TRUE(m_output_files[segment_id] != nullptr);
  // try to compress the data
  size_t compress_bound = LZ4_compressBound(block.block_size);
  auto compression_buffer = m_buffer_pool.get_new_buffer();
  compression_buffer->resize(compress_bound);
  char* cbuffer = compression_buffer->data();
  size_t clen = compress_bound;
  clen = LZ4_compress(data, cbuffer, block.block_size);

  char* buffer_to_write = NULL;
  size_t buffer_to_write_len = 0;
  if (clen < COMPRESSION_DISABLE_THRESHOLD * block.block_size) {
    // compression has a benefit!
    block.flags |= LZ4_COMPRESSION;
    block.length = clen;
    buffer_to_write = cbuffer;
    buffer_to_write_len = clen;
  } else {
    // compression has no benefit! do not compress!
    // unset LZ4
    block.flags &= (~(size_t)LZ4_COMPRESSION);
    block.length = block.block_size;
    buffer_to_write = data;
    buffer_to_write_len = block.block_size;
  }

  size_t padding = ((buffer_to_write_len + 4095) / 4096) * 4096 - buffer_to_write_len;
  ASSERT_LT(padding, 4096);
  // write!
  m_output_file_locks[segment_id].lock();
  block.offset = m_output_bytes_written[segment_id];
  m_output_bytes_written[segment_id] += buffer_to_write_len + padding;
  m_index_info.columns[column_id].segment_sizes[segment_id] += block.num_elem;
  m_output_files[segment_id]->write(buffer_to_write, buffer_to_write_len);
  m_output_files[segment_id]->write(padding_bytes, padding);
  m_blocks[segment_id][column_id].push_back(block);
  m_output_file_locks[segment_id].unlock();

  m_buffer_pool.release_buffer(std::move(compression_buffer));

  if (!m_output_files[segment_id]->good()) {
    log_and_throw_io_failure("Failed to write. Disk may be full.");
  }
  return buffer_to_write_len;
}
Example No. 12
 inline vertex_partition_exchange get_vertex_partition_exchange(size_t partition_id, const std::unordered_set<size_t>& vertex_ids, const std::vector<size_t>& field_ids) {
   DASSERT_TRUE(m_is_partition_loaded[partition_id]);
   vertex_partition_exchange ret;
   ret.partition_id = partition_id;
   ret.field_ids = field_ids;
   auto& vertex_partition = *(m_vertex_partitions[partition_id]);
   for (size_t vid:  vertex_ids) {
     auto& vdata = vertex_partition[vid];
     sgraph_vertex_data vdata_subset;
     for (auto fid: field_ids)  vdata_subset.push_back(vdata[fid]);
     ret.vertices.push_back({vid, std::move(vdata_subset)});
   }
   return ret;
 }
Example No. 13
  inline void update_vertex_partition(vertex_partition_exchange& vpartition_exchange) {
    DASSERT_TRUE(m_is_partition_loaded[vpartition_exchange.partition_id]);

    auto& vertex_partition = *(m_vertex_partitions[vpartition_exchange.partition_id]);
    auto& update_field_index = vpartition_exchange.field_ids; 

    for (auto& vid_data_pair : vpartition_exchange.vertices) {
      size_t id = vid_data_pair.first;
      sgraph_vertex_data& vdata = vid_data_pair.second;
      for (size_t i = 0; i < update_field_index.size(); ++i) {
        size_t fid = vpartition_exchange.field_ids[i];
        vertex_partition[id][fid] = vdata[i];
      }
    }
  }
Example No. 14
gl_gframe::gl_gframe(gl_sgraph* g, gframe_type_enum t) :
  m_sgraph(g), m_gframe_type(t) {
  DASSERT_TRUE(m_sgraph != NULL);
}
Example No. 15
 /// Releases a lock on the mutex
 inline void unlock() const {
   int error = pthread_mutex_unlock( &m_mut );
   DASSERT_TRUE(!error);
 }
Example No. 16
 /// Acquires a lock on the mutex
 inline void lock() const {
   int error = pthread_mutex_lock( &m_mut  );
   // if (error) std::cerr << "mutex.lock() error: " << error << std::endl;
   DASSERT_TRUE(!error);
 }
Example No. 17
 ~recursive_mutex(){
   int error = pthread_mutex_destroy( &m_mut );
   DASSERT_TRUE(!error);
 }
Example No. 18
GL_HOT_NOINLINE_FLATTEN 
void __run_top_k_small_k(std::vector<T>& v, LessThan less_than, size_t k) {

  std::sort(v.begin(), v.begin() + k, less_than);
  
  for(size_t i = k; i < v.size(); ++i) {
    if(less_than(v[0], v[i])) {
      
#ifndef NDEBUG
      // Preserve all the elements so the debug routines below can check things. 
      std::swap(v[0], v[i]);
#else
      // Just do an assignment.
      v[0] = v[i];
#endif

      for(size_t j = 1; j < k; ++j) { 
        if(!less_than(v[j-1], v[j])) {
          std::swap(v[j], v[j-1]);
        } else {
          break;
        }
      }
    }
  }

#ifndef NDEBUG

  // Run checking code here to make sure this is equivalent to
  // nth_element + sort.
  std::vector<T> va;
  va.assign(v.begin(), v.begin() + k); 
  
  auto gt_sorter = [&](const T& t1, const T& t2) {
    return less_than(t2, t1);
  }; 

  std::nth_element(v.begin(), v.begin() + k, v.end(), gt_sorter);

  std::sort(v.begin(), v.begin() + k, gt_sorter);
  for(size_t j = 0; j < k; ++j) {
    // test for equality using the less_than operator
    ASSERT_TRUE(!less_than(v[j], va[k - 1 - j]) && !less_than(va[k - 1 - j], v[j]));
  }

  for(size_t i = k; i < v.size(); ++i) {
    for(size_t j = 0; j < k; ++j) { 
      ASSERT_TRUE(bool(!less_than(v[j], v[i])));
    }
  }

  // Copy them back in sorted decreasing order.
  for(size_t i = 0; i < k; ++i) {
    v[k - 1 - i] = va[i];
  }
#else
  std::reverse(v.begin(), v.begin() + k); 
#endif
  
  DASSERT_TRUE(bool(std::is_sorted(v.begin(), v.begin() + k, gt_sorter)));
  
  v.resize(k);
} 
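
A minimal usage sketch (treating the internal helper as directly callable; the template parameters are deduced from the arguments). The routine leaves the k largest elements, sorted in decreasing order of less_than, and shrinks the vector to size k:

  std::vector<int> v = {5, 1, 9, 3, 7, 2};
  __run_top_k_small_k(v, std::less<int>(), 3);
  // v is now {9, 7, 5}.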
Example No. 19
 /**
  * Sets the input types and returns the output type. For instance,
  * a sum aggregator when summing integers will return an integer, and when
  * summing doubles will return doubles.
  *
  * Default implementation assumes there is only one input, and the output
  * type is the same as the input type.
  */
 virtual flex_type_enum set_input_types(const std::vector<flex_type_enum>& types) {
   DASSERT_TRUE(types.size() == 1);
   return set_input_type(types[0]);
 }
Example No. 20
 /**
  * Adds an element to the aggregate. Elements to be added will be either
  * the input_type (as set by set_input_type()) or UNDEFINED.
  *
  * Operators that expect more than one input value need to override this function.
  */
 virtual void add_element(const std::vector<flexible_type>& values) {
   DASSERT_TRUE(values.size() == 1);
   add_element_simple(values[0]);
 }
Example No. 21
  std::shared_ptr<sframe> sort(
    std::shared_ptr<lazy_sframe> sframe_ptr,
    const std::vector<std::string>& sort_column_names,
    const std::vector<bool>& sort_orders) {

    log_func_entry();

    // get sort column indexes from column names and also check column types
    std::vector<size_t> sort_column_indexes(sort_column_names.size());
    std::vector<flex_type_enum> supported_types =
        {flex_type_enum::STRING, flex_type_enum::INTEGER, flex_type_enum::FLOAT,flex_type_enum::DATETIME};
    std::set<flex_type_enum> supported_type_set(supported_types.begin(), supported_types.end());

    for(size_t i = 0; i < sort_column_names.size(); i++) {
      sort_column_indexes[i] = sframe_ptr->column_index(sort_column_names[i]);
      auto col_type = sframe_ptr->column_type(sort_column_indexes[i]);

      if (supported_type_set.count(col_type) == 0) {
        log_and_throw("Only column with type 'int', 'float', 'string', and 'datetime' can be sorted. Column '" +
            sort_column_names[i] + "'' is type: " + flex_type_enum_to_name(col_type));
      }
    }

    // Estimate the size of the sframe so that we can decide the number of
    // chunks.  To account for strings, we estimate each cell is 64 bytes.
    // I'd love to estimate better.
    size_t estimated_sframe_size = sframe_num_cells(sframe_ptr) * 64.0;
    size_t num_partitions = std::ceil((1.0 * estimated_sframe_size) / sframe_config::SFRAME_SORT_BUFFER_SIZE);

    // Make partitions small enough for each thread to (theoretically) sort at once
    num_partitions = num_partitions * thread::cpu_count();

    // If we have more partitions than this, we could run into open file
    // descriptor limits
    num_partitions = std::min<size_t>(num_partitions, SFRAME_SORT_MAX_SEGMENTS);
    DASSERT_TRUE(num_partitions > 0);

    // Shortcut -- if only one partition, do an in-memory sort and we are done
    if (num_partitions <= thread::cpu_count()) {
      logstream(LOG_INFO) << "Sorting SFrame in memory" << std::endl;
      return sframe_sort_impl::sort_sframe_in_memory(sframe_ptr, sort_column_indexes, sort_orders);
    }

    // This is a collection of partition keys sorted in the required order.
    // Each key is a flex_list value that contains the splitting value for
    // each sort column. Together they define the "cut line" for all rows in
    // the SFrame.
    std::vector<flexible_type> partition_keys;


    // Do a quantile sketch on the sort columns to figure out the "splitting" points
    // for the SFrame
    timer ti;
    bool all_sorted = sframe_sort_impl::get_partition_keys(
      sframe_ptr->select_columns(sort_column_names),
      sort_orders, num_partitions, // in parameters
      partition_keys);  // out parameters
    logstream(LOG_INFO) << "Pivot estimation step: " << ti.current_time() << std::endl;

    // In the rare case that all values in the SFrame are the same, there is no need to sort
    if (all_sorted) return sframe_ptr->get_sframe_ptr();

    // Scatter-partition the sframe into multiple chunks. The chunks are
    // sorted relative to each other, but each chunk is not internally sorted;
    // the sorting of each chunk is delayed until it is consumed. Each chunk is
    // stored as one segment of an sarray, holding a serialized version of key
    // and value.
    std::vector<size_t> partition_sizes;

    // If all sort keys in a given partition are the same, then there is no
    // need to sort that partition. This information is derived from the
    // scatter step.
    std::vector<bool> partition_sorted(num_partitions, true);
    ti.start();
    auto partition_array = sframe_sort_impl::scatter_partition(
      sframe_ptr, sort_column_indexes, sort_orders, partition_keys, partition_sizes, partition_sorted);
    logstream(LOG_INFO) << "Scatter step: " << ti.current_time() << std::endl;

    // return a lazy sframe_ptr that would emit the sorted data lazily
    auto lazy_sort = std::make_shared<le_sort>(
      partition_array, partition_sorted, partition_sizes, sort_column_indexes,
      sort_orders, sframe_ptr->column_names(), sframe_ptr->column_types());

    return lazy_sort->eager_sort();
  }
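
A worked sketch of the partition-count estimate above, using made-up numbers (the 256 MB buffer size, 8-thread count, and 128-segment cap are assumptions; the real values come from sframe_config::SFRAME_SORT_BUFFER_SIZE, thread::cpu_count(), and SFRAME_SORT_MAX_SEGMENTS):

  size_t num_cells = 100000000;                        // 100M cells
  size_t estimated_sframe_size = num_cells * 64;       // ~6.4 GB estimate
  size_t buffer_size = 256 * 1024 * 1024;              // assumed sort buffer
  size_t num_partitions =
      (estimated_sframe_size + buffer_size - 1) / buffer_size;  // ceil -> 24
  num_partitions *= 8;                                 // x cpu_count -> 192
  num_partitions = std::min<size_t>(num_partitions, 128 /* assumed max */);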
Example No. 22
 inline std::vector<sgraph_vertex_data>& get_partition(size_t partition_id) {
   DASSERT_LT(partition_id, m_num_partitions);
   DASSERT_TRUE(is_loaded(partition_id));
   return m_vertex_partitions[partition_id];
 }
 const T& value() const { DASSERT_TRUE(false); return T(); }
Example No. 24
gl_sarray gl_sarray::cumulative_aggregate(
     std::shared_ptr<group_aggregate_value> aggregator) const { 
  
  flex_type_enum input_type = this->dtype();
  flex_type_enum output_type = aggregator->set_input_types({input_type});
  if (! aggregator->support_type(input_type)) {
    std::stringstream ss;
    ss << "Cannot perform this operation on an SArray of type "
       << flex_type_enum_to_name(input_type) << "." << std::endl;
    log_and_throw(ss.str());
  } 

  // Empty case.  
  size_t m_size = this->size();
  if (m_size == 0) {
    return gl_sarray({}, output_type);
  }
  
  // Make a copy of a newly initialized aggregator for each thread.
  size_t n_threads = thread::cpu_count();
  gl_sarray_writer writer(output_type, n_threads);
  std::vector<std::shared_ptr<group_aggregate_value>> aggregators;
  for (size_t i = 0; i < n_threads; i++) {
      aggregators.push_back(
          std::shared_ptr<group_aggregate_value>(aggregator->new_instance()));
  } 

  // Skip Phases 1,2 when single threaded or more threads than rows.
  if ((n_threads > 1) && (m_size > n_threads)) {
    
    // Phase 1: Compute prefix-sums for each block.
    in_parallel([&](size_t thread_idx, size_t n_threads) {
      size_t start_row = thread_idx * m_size / n_threads; 
      size_t end_row = (thread_idx + 1) * m_size / n_threads;
      for (const auto& v: this->range_iterator(start_row, end_row)) {
        DASSERT_TRUE(thread_idx < aggregators.size());
        if (v != FLEX_UNDEFINED) {
          aggregators[thread_idx]->add_element_simple(v);
        }
      }
    });

    // Phase 2: Combine prefix-sum(s) at the end of each block.
    for (size_t i = n_threads - 1; i > 0; i--) {
      for (size_t j = 0; j < i; j++) {
        DASSERT_TRUE(i < aggregators.size());
        DASSERT_TRUE(j < aggregators.size());
        aggregators[i]->combine(*aggregators[j]);
      }
    }
  }
  
  // Phase 3: Re-aggregate with a re-initialized prefix-sum from the previous blocks.
  auto reagg_fn = [&](size_t thread_idx, size_t n_threads) {
    flexible_type y = FLEX_UNDEFINED;
    size_t start_row = thread_idx * m_size / n_threads; 
    size_t end_row = (thread_idx + 1) * m_size / n_threads;
    std::shared_ptr<group_aggregate_value> re_aggregator (
                                              aggregator->new_instance());
  
    // Initialize with the merged value. 
    if (thread_idx >= 1) {
      DASSERT_TRUE(thread_idx - 1 < aggregators.size());
      y = aggregators[thread_idx - 1]->emit();
      re_aggregator->combine(*aggregators[thread_idx - 1]);
    }

    // Write prefix-sum
    for (const auto& v: this->range_iterator(start_row, end_row)) {
      if (v != FLEX_UNDEFINED) {
        re_aggregator->add_element_simple(v);
        y = re_aggregator->emit();
      }
      writer.write(y, thread_idx);
    }
  };
  
  // Run in parallel, or single-threaded if there are more threads than rows.
  if (m_size > n_threads) {
    in_parallel(reagg_fn);
  } else {
    reagg_fn(0, 1);   
  }
  return writer.close();
}
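
A minimal sketch of a cumulative sum built on this routine. The concrete aggregator class name (groupby_operators::sum) is an assumption here; any group_aggregate_value that supports INTEGER input would work the same way:

  std::vector<flexible_type> vals = {1, 2, 3, 4, 5};
  gl_sarray x(vals, flex_type_enum::INTEGER);
  auto agg = std::make_shared<groupby_operators::sum>();
  gl_sarray prefix = x.cumulative_aggregate(agg);
  // prefix now holds [1, 3, 6, 10, 15].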