/// Combine the value found at `localizable` on every core into a single
/// result on the calling core.
///
/// The caller seeds the accumulator with its own local copy, then sends one
/// message to each other core. Each of those cores reads its local copy and
/// replies with a message that folds the value into the accumulator via
/// `ReduceOp` and signals the completion event.
///
/// `T` and `ReduceOp` are not declared in this block; presumably they are
/// template parameters introduced just above it -- confirm.
///
/// NOTE(review): replies are folded in whatever order they execute, so this
/// presumably expects `ReduceOp` to be order-insensitive -- confirm.
///
/// @param localizable  global address whose per-core local value
///                     (`*localizable.localize()`) is reduced.
/// @return the accumulated value once the completion event has been waited on.
T reduce( GlobalAddress<T> localizable ) {
  // One completion is expected from each core other than the caller.
  CompletionEvent pending(cores() - 1);

  T accum = *(localizable.localize());
  Core home = mycore();

  for (Core target = 0; target < cores(); target++) {
    // The caller's own value is already in `accum`.
    if (target == home) continue;

    // `pending` and `accum` are captured by reference but are only touched
    // inside the reply message, which is sent back to `home`.
    send_heap_message(target, [localizable, &pending, &accum, home] {
      T contribution = *(localizable.localize());

      send_heap_message(home, [contribution, &pending, &accum] {
        accum = ReduceOp(accum, contribution);
        pending.complete();
      });
    });
  }

  pending.wait();
  return accum;
}
void bucket_sort(GlobalAddress<S> array, size_t nelems, int (*Scmp)(const void*,const void*), size_t (*lobits)(S,int), int log2buckets, int log2maxkey ) { double t, sort_time, histogram_time, allreduce_time, scatter_time, local_sort_scatter_time, put_back_time; int LOBITS = log2maxkey - log2buckets; size_t nbuckets = 1 << log2buckets; GlobalAddress<bucket_t<S> > bucketlist = Grappa::global_alloc<bucket_t<S> >(nbuckets); #ifdef DEBUG for (size_t i=0; i<nbuckets; i++) { GlobalAddress<bucket_t<S> > bi = bucketlist+i; VLOG(1) << "bucket[" << i << "] on Node " << bi.core() << ", offset = " << bi.pointer() - bucketlist.localize(bi.core()) << ", ptr = " << bi.pointer(); } #endif sort_time = Grappa::walltime(); // initialize globals and histogram counts // { setup_counts f(array, nbuckets, bucketlist); fork_join_custom(&f); } on_all_cores([nbuckets]{ counts.resize(nbuckets); offsets.resize(nbuckets); for (size_t i=0; i<nbuckets; i++) { counts[i] = 0; } }); t = Grappa::walltime(); // do local bucket counts // forall_local<uint64_t,histogram>(array, nelems); forall(array, nelems, [lobits,LOBITS](int64_t i, S& v) { size_t b = lobits(v, LOBITS); // TODO decide how to compare general in pieces counts[b]++; }); histogram_time = Grappa::walltime() - t; LOG(INFO) << "histogram_time: " << histogram_time; t = Grappa::walltime(); // allreduce everyone's counts & compute global offsets (prefix sum) // { aggregate_counts f; fork_join_custom(&f); } on_all_cores([nbuckets] { CHECK_EQ(nbuckets, counts.size()); // all nodes get total counts put into their counts array allreduce_inplace<size_t,collective_add>(&counts[0], counts.size()); VLOG(1) << "after allreduce_inplace (just in case)"; // everyone computes prefix sum over buckets locally offsets[0] = 0; for (size_t i=1; i<offsets.size(); i++) { offsets[i] = offsets[i-1] + counts[i-1]; } }); allreduce_time = Grappa::walltime() - t; LOG(INFO) << "allreduce_time: " << allreduce_time; // allocate space in buckets VLOG(3) << "allocating 
space..."; // forall_local<bucket_t,init_buckets>(bucketlist, nbuckets); forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){ // (global malloc doesn't call constructors) new (&bucket) bucket_t<S>(); bucket.reserve(counts[id]); }); VLOG(3) << "scattering..."; t = Grappa::walltime(); // scatter into buckets // forall_local<uint64_t,scatter>(array, nelems); forall(array, nelems, [bucketlist,lobits,LOBITS](int64_t s, int64_t n, S * first){ size_t nbuckets = counts.size(); for (int i=0; i<n; i++) { auto v = first[i]; size_t b = lobits(v, LOBITS); CHECK( b < nbuckets ) << "bucket id = " << b << ", nbuckets = " << nbuckets; // ff_delegate<bucket_t,uint64_t,ff_append>(bucketlist+b, v); auto destb = bucketlist+b; delegate::call<async>(destb.core(), [destb,v]{ destb.pointer()->append(v); }); } }); scatter_time = Grappa::walltime() - t; LOG(INFO) << "scatter_time: " << scatter_time; t = Grappa::walltime(); // sort buckets locally // forall_local<bucket_t,sort_bucket>(bucketlist, nbuckets); /// Do some kind of local serial sort of a bucket forall(bucketlist, nbuckets, [Scmp](int64_t bucket_id, bucket_t<S>& bucket){ if (bucket.size() == 0) return; qsort(&bucket[0], bucket.size(), sizeof(S), Scmp); }); local_sort_scatter_time = Grappa::walltime() - t; LOG(INFO) << "local_sort_time: " << local_sort_scatter_time; t = Grappa::walltime(); // redistribute buckets back into global array // forall_local<bucket_t,put_back_bucket>(bucketlist, nbuckets); /// Redistribute sorted buckets back into global array forall(bucketlist, nbuckets, [array,bucketlist](int64_t b, bucket_t<S>& bucket) { const size_t NBUF = BUFSIZE / sizeof(S); DCHECK( b < counts.size() ); // TODO: shouldn't need to buffer this, but a bug of some sort is currently forcing us to limit the number of outstanding messages //for_buffered(i, n, 0, bucket.size(), NBUF) { // typename Incoherent<S>::WO c(array+offsets[b]+i, n, &bucket[i]); // c.block_until_released(); // } if ( bucket.size() > 0 ) { typename 
Incoherent<S>::WO c(array+offsets[b], bucket.size(), &bucket[0]); // FIXME v is not in locale-shared-memory c.block_until_released(); } VLOG(3) << "bucket[" << b << "] release successful"; }); put_back_time = Grappa::walltime() - t; LOG(INFO) << "put_back_time: " << put_back_time; sort_time = Grappa::walltime() - sort_time; LOG(INFO) << "total_sort_time: " << sort_time; }