예제 #1
0
   /// Combines one value per core into a single result on the calling core.
   ///
   /// The calling core seeds the accumulator with the element that
   /// `localizable.localize()` points at locally. Every other core is then sent
   /// a message that reads its own localized element and replies to the caller
   /// with it; each reply folds the value in via `ReduceOp(accumulator, value)`
   /// and signals the completion event. The function returns after waiting on a
   /// completion event that was initialised with `cores() - 1`.
   ///
   /// `T` and `ReduceOp` are not declared in this excerpt (presumably template
   /// parameters of the enclosing declaration -- confirm against the full file).
   ///
   /// @param localizable  address whose `localize()` yields each core's element
   /// @return the accumulated value after all replies have been folded in
   T reduce( GlobalAddress<T> localizable ) {
     // One completion is expected from every core except the caller.
     CompletionEvent replies_done(cores() - 1);

     Core origin = mycore();
     T accum = *(localizable.localize());

     for (Core peer = 0; peer < cores(); peer++) {
       if (peer == origin) {
         continue;  // the caller's own value is already in the accumulator
       }

       // The references are only forwarded by the remote core; they are
       // dereferenced solely inside the reply that is sent back to `origin`.
       send_heap_message(peer, [localizable, &replies_done, &accum, origin]{
         T remote_val = *(localizable.localize());
         send_heap_message(origin, [remote_val, &replies_done, &accum] {
           accum = ReduceOp(accum, remote_val);
           replies_done.complete();
         });
       });
     }

     replies_done.wait();
     return accum;
   }
예제 #2
0
파일: sort.hpp 프로젝트: HTOKORG/grappa
/// Sorts the `nelems` elements of the global array `array` in place using a
/// distributed bucket sort, logging the wall-clock time of each phase.
///
/// Phases, in order:
///   1. reset the `counts` / `offsets` vectors on every core;
///   2. histogram: count, per bucket, how many elements `lobits` maps to it;
///   3. allreduce the counts and compute each bucket's starting offset as an
///      exclusive prefix sum;
///   4. construct the buckets and reserve `counts[id]` slots in each;
///   5. scatter every element to its bucket with an asynchronous delegate call;
///   6. `qsort` each non-empty bucket with `Scmp`;
///   7. write each bucket back to `array` starting at `offsets[b]`;
///   8. destroy the buckets and release the temporary bucket list.
///
/// `S` (the element type), `counts`, `offsets` and `BUFSIZE` are declared
/// outside this excerpt.
///
/// @param array        global array to sort; overwritten with the sorted data
/// @param nelems       number of elements in `array`
/// @param Scmp         qsort-style comparator used to sort each bucket locally
/// @param lobits       maps (element, LOBITS) to a bucket index, which must be
///                     smaller than the number of buckets
/// @param log2buckets  log2 of the number of buckets
/// @param log2maxkey   only used to derive `LOBITS = log2maxkey - log2buckets`,
///                     the second argument passed to `lobits`
void bucket_sort(GlobalAddress<S> array, size_t nelems, int (*Scmp)(const void*,const void*), size_t (*lobits)(S,int), int log2buckets, int log2maxkey ) {
  double t, sort_time, histogram_time, allreduce_time, scatter_time, local_sort_scatter_time, put_back_time;

  int LOBITS = log2maxkey - log2buckets;
  // Shift a size_t, not an int literal: `1 << log2buckets` is evaluated in int
  // and overflows before the result is widened when log2buckets >= 31.
  size_t nbuckets = static_cast<size_t>(1) << log2buckets;

  GlobalAddress<bucket_t<S> > bucketlist = Grappa::global_alloc<bucket_t<S> >(nbuckets);

#ifdef DEBUG
  for (size_t i=0; i<nbuckets; i++) {
    GlobalAddress<bucket_t<S> > bi = bucketlist+i;
    VLOG(1) << "bucket[" << i << "] on Node " << bi.core() << ", offset = " << bi.pointer() - bucketlist.localize(bi.core()) << ", ptr = " << bi.pointer();
  }
#endif

  sort_time = Grappa::walltime();

  // initialize globals and histogram counts
  // { setup_counts f(array, nbuckets, bucketlist); fork_join_custom(&f); }
  on_all_cores([nbuckets]{
    counts.resize(nbuckets);
    offsets.resize(nbuckets);
    for (size_t i=0; i<nbuckets; i++) {
      counts[i] = 0;
    }
  });

  t = Grappa::walltime();

  // do local bucket counts
  // forall_local<uint64_t,histogram>(array, nelems);
  forall(array, nelems, [lobits,LOBITS](int64_t i, S& v) {
    size_t b = lobits(v, LOBITS); // TODO decide how to compare general in pieces
    // Same bound the scatter phase CHECKs; catch it here in debug builds
    // before it becomes an out-of-range write into `counts`.
    DCHECK( b < counts.size() ) << "bucket id = " << b << ", nbuckets = " << counts.size();
    counts[b]++;
  });

  histogram_time = Grappa::walltime() - t;
  LOG(INFO) << "histogram_time: " << histogram_time;
  t = Grappa::walltime();

  // allreduce everyone's counts & compute global offsets (prefix sum)
  // { aggregate_counts f; fork_join_custom(&f); }
  on_all_cores([nbuckets] {
    CHECK_EQ(nbuckets, counts.size());
    // all nodes get total counts put into their counts array
    allreduce_inplace<size_t,collective_add>(&counts[0], counts.size());

    VLOG(1) << "after allreduce_inplace (just in case)";

    // everyone computes prefix sum over buckets locally
    offsets[0] = 0;
    for (size_t i=1; i<offsets.size(); i++) {
      offsets[i] = offsets[i-1] + counts[i-1];
    }
  });

  allreduce_time = Grappa::walltime() - t;
  LOG(INFO) << "allreduce_time: " << allreduce_time;

  // allocate space in buckets
  VLOG(3) << "allocating space...";
  // forall_local<bucket_t,init_buckets>(bucketlist, nbuckets);
  forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){
    // (global malloc doesn't call constructors)
    new (&bucket) bucket_t<S>();
    bucket.reserve(counts[id]);
  });

  VLOG(3) << "scattering...";
  t = Grappa::walltime();

  // scatter into buckets
  // forall_local<uint64_t,scatter>(array, nelems);
  forall(array, nelems, [bucketlist,lobits,LOBITS](int64_t s, int64_t n, S * first){
    size_t nbuckets = counts.size();

    // `n` is int64_t, so index with the same type rather than int.
    for (int64_t i=0; i<n; i++) {
      auto v = first[i];
      size_t b = lobits(v, LOBITS);
      CHECK( b < nbuckets ) << "bucket id = " << b << ", nbuckets = " << nbuckets;
      // ff_delegate<bucket_t,uint64_t,ff_append>(bucketlist+b, v);
      auto destb = bucketlist+b;
      delegate::call<async>(destb.core(), [destb,v]{
        destb.pointer()->append(v);
      });
    }
  });

  scatter_time = Grappa::walltime() - t;
  LOG(INFO) << "scatter_time: " << scatter_time;
  t = Grappa::walltime();

  // sort buckets locally
  // forall_local<bucket_t,sort_bucket>(bucketlist, nbuckets);
  /// Do some kind of local serial sort of a bucket
  forall(bucketlist, nbuckets, [Scmp](int64_t bucket_id, bucket_t<S>& bucket){
    if (bucket.size() == 0) return;
    qsort(&bucket[0], bucket.size(), sizeof(S), Scmp);
  });

  local_sort_scatter_time = Grappa::walltime() - t;
  LOG(INFO) << "local_sort_time: " << local_sort_scatter_time;
  t = Grappa::walltime();

  // redistribute buckets back into global array
  // forall_local<bucket_t,put_back_bucket>(bucketlist, nbuckets);
  /// Redistribute sorted buckets back into global array
  forall(bucketlist, nbuckets, [array](int64_t b, bucket_t<S>& bucket) {
    // Only referenced by the disabled buffered path below.
    const size_t NBUF = BUFSIZE / sizeof(S);
    DCHECK( b < counts.size() );

    // TODO: shouldn't need to buffer this, but a bug of some sort is currently forcing us to limit the number of outstanding messages
    //for_buffered(i, n, 0, bucket.size(), NBUF) {
    //  typename Incoherent<S>::WO c(array+offsets[b]+i, n, &bucket[i]);
    //  c.block_until_released();
    // }

    if ( bucket.size() > 0 ) {
      typename Incoherent<S>::WO c(array+offsets[b], bucket.size(), &bucket[0]); // FIXME v is not in locale-shared-memory
      c.block_until_released();
    }

    VLOG(3) << "bucket[" << b << "] release successful";
  });

  put_back_time = Grappa::walltime() - t;
  LOG(INFO) << "put_back_time: " << put_back_time;

  sort_time = Grappa::walltime() - sort_time;
  LOG(INFO) << "total_sort_time: " << sort_time;

  // Release the temporary buckets (outside the timed region so the reported
  // numbers stay comparable). They were constructed with placement new, so
  // their destructors must be run explicitly before the storage is returned.
  // NOTE(review): assumes bucket_t's destructor releases whatever reserve()
  // allocated -- confirm against the bucket_t definition.
  forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){
    using bucket_type = bucket_t<S>;
    bucket.~bucket_type();
  });
  Grappa::global_free(bucketlist);
}