/// Delegate read: returns a copy of the value that `target` points to.
/// The dereference happens inside the lambda handed to `call` for
/// `target.core()`; `delegate_reads` is bumped by the caller and
/// `delegate_read_targets` inside that lambda.
T read(GlobalAddress<T> target) {
  delegate_reads++;

  const auto home = target.core();
  return call<S,C>(home, [target]() -> T {
    delegate_read_targets++;
    return *target.pointer();
  });
}
/// Publish a chunk descriptor (base address + element count) to the global queue.
///
/// First reserves a queue slot through `push_reserve_g` on HOME_NODE, then
/// sends the entry to the core that owns the reserved slot.
///
/// @param chunk_base   global address of the first element of the chunk
/// @param chunk_amount number of elements in the chunk
/// @return false if the reservation came back with a NULL slot, true otherwise
bool GlobalQueue<T>::push( GlobalAddress<T> chunk_base, uint64_t chunk_amount ) {
  CHECK( initialized );
  DVLOG(5) << "push() base:" << chunk_base << " amount:" << chunk_amount;

  // phase 1: reserve a slot
  GlobalAddress< QueueEntry<T> > loc =
    Grappa_delegate_func< bool, GlobalAddress< QueueEntry<T> >, GlobalQueue<T>::push_reserve_g > ( false, HOME_NODE );
  size_t msg_bytes = Grappa_sizeof_delegate_func_request< bool, GlobalAddress< QueueEntry<T> > >( );
  DVLOG(5) << "push() reserve done -- loc:" << loc;

  const bool reserved = ( loc.pointer() != NULL );
  Grappa::Metrics::global_queue_stats.record_push_reserve_request( msg_bytes, reserved );
  if ( !reserved ) {
    // no space in global queue; push failed
    return false;
  }

  // phase 2: push the queue entry that points to my chunk
  ChunkInfo<T> chunk_info;
  chunk_info.base = chunk_base;
  chunk_info.amount = chunk_amount;

  push_entry_args<T> entry_args;
  entry_args.target = loc;
  entry_args.chunk = chunk_info;

  DVLOG(5) << "push() sending entry to " << loc;
  bool had_sleeper =
    Grappa_delegate_func< push_entry_args<T>, bool, GlobalQueue<T>::push_entry_g > ( entry_args, loc.core() );
  size_t entry_msg_bytes = Grappa_sizeof_delegate_func_request< push_entry_args<T>, bool >( );
  Grappa::Metrics::global_queue_stats.record_push_entry_request( entry_msg_bytes, had_sleeper );

  return true;
}
/// Insert-or-update `key` in the hash cell it maps to.
///
/// The work runs inside a blocking delegate call on the cell's home core.
/// If the key is already present its value becomes `UpF(old, val)`;
/// otherwise a new entry with value `UpF(Init, val)` is appended to the
/// cell's list. The cell's list is allocated lazily on first touch.
///
/// @param key key to insert or update
/// @param val update operand passed to `UpF`
void update( K key, UV val ) {
  uint64_t index = computeIndex( key );
  GlobalAddress< Cell > target = base + index;

  Grappa::delegate::call( target.core(), [key, val, target]() {   // TODO: upgrade to call_async; using GCE
    // list of entries in this cell
    std::list<DHT_TYPE(Entry)> * entries = target.pointer()->entries;

    // if first time the cell is hit then initialize
    if ( entries == NULL ) {
      entries = new std::list<Entry>();
      target.pointer()->entries = entries;
    }

    // find matching key in the list
    typename std::list<DHT_TYPE(Entry)>::iterator i;
    for (i = entries->begin(); i!=entries->end(); ++i) {
      if ( i->key == key ) {
        // key found so update
        i->value = UpF(i->value, val);
        // NOTE(review): the counter is bumped on the update path only, as in
        // the original; confirm whether it is meant to track insertions instead.
        hash_tables_size+=1;
        return 0;
      }
    }

    // this is the first time the key has been seen
    // so add it to the list (previously the entry was built but never stored)
    Entry newe( key, UpF(Init, val));
    entries->push_back( newe );

    return 0;
  });
}
/// Read a FullEmpty<T> through a global address using its `readFF()` operation.
///
/// When the address is on this core the local object is read directly.
/// Otherwise a message is sent to the owning core: if the remote object is
/// already full its value is forwarded via `fill_remote`; if not, a
/// SuspendedDelegate is registered as a waiter on it and forwards the value
/// when it runs. The caller then reads the stack-local `result`.
T readFF(GlobalAddress<FullEmpty<T>> fe_addr) {
  const bool is_local = (fe_addr.core() == mycore());
  if (is_local) {
    DVLOG(2) << "local";
    return fe_addr.pointer()->readFF();
  }

  // landing spot for the remote value
  FullEmpty<T> result;
  auto result_addr = make_global(&result);

  send_message(fe_addr.core(), [fe_addr,result_addr]{
    auto& fe = *fe_addr.pointer();

    if (!fe.full()) {
      DVLOG(2) << "setting up to block (" << fe_addr << ")";
      auto* waiter = SuspendedDelegate::create([&fe,result_addr]{
        VLOG(0) << "suspended_delegate!";
        fill_remote(result_addr, fe.readFF());
      });
      add_waiter(&fe, waiter);
    } else {
      // DVLOG(2) << "no need to block";
      fill_remote(result_addr, fe.readFF());
    }
  });

  return result.readFF();
}
/// Asynchronous delegate increment: issues `*target += inc` through
/// `delegate::call<SyncMode::Async,C>` on `target.core()`.
/// Nothing is returned to the caller.
void increment(GlobalAddress<T> target, U inc) {
  static_assert(std::is_convertible<T,U>::value, "type of inc must match GlobalAddress type");

  delegate_async_increments++;

  const auto home = target.core();
  delegate::call<SyncMode::Async,C>(home, [target,inc]{
    (*target.pointer()) += inc;
  });
}
/// Acquire a mutex named by a global address.
/// Only the local case is handled; a remote address trips `CHECK(false)`.
inline void lock( GlobalAddress<Mutex> m ) {
  const bool is_local = ( m.core() == mycore() );
  if( !is_local ) {
    // if remote, spawn a task on the home node to acquire
    CHECK(false);
    return;
  }

  // if local, just acquire
  lock( m.pointer() );
}
/// Delegate write: stores `value` into the location named by `target`.
/// The assignment runs inside the lambda handed to `call<S,C>` for
/// `target.core()`; `delegate_writes` is bumped by the caller and
/// `delegate_write_targets` inside that lambda.
void write(GlobalAddress<T> target, U value) {
  static_assert(std::is_convertible<T,U>::value, "type of value must match GlobalAddress type");

  delegate_writes++;

  const auto home = target.core();
  // TODO: don't return any val, requires changes to `delegate::call()`.
  return call<S,C>(home, [target, value] {
    delegate_write_targets++;
    *target.pointer() = value;
  });
}
/// Delegate fetch-and-add: adds `inc` to the value at `target` and returns
/// the value that was stored there before the addition.
/// The read and the add happen inside a single delegate lambda run via
/// `call` on `target.core()`.
T fetch_and_add(GlobalAddress<T> target, U inc) {
  delegate_fetchadds++;

  const auto home = target.core();
  return call(home, [target, inc]() -> T {
    delegate_fetchadd_targets++;
    T* slot = target.pointer();
    T previous = *slot;
    *slot += inc;
    return previous;
  });
}
/// delegate malloc static GlobalAddress< void > remote_malloc( size_t size_bytes ) { // ask node 0 to allocate memory auto allocated_address = Grappa::impl::call( 0, [size_bytes] { DVLOG(5) << "got malloc request for size " << size_bytes; GlobalAddress< void > a = global_allocator->local_malloc( size_bytes ); DVLOG(5) << "malloc returning pointer " << a.pointer(); return a; }); return allocated_address; }
/// Construct an acquirer that does no communication.
///
/// Stores the three pointers and, where possible, resolves `*pointer`
/// immediately:
///  - zero-length request (or a null `count` pointer): `*pointer` is set to NULL;
///  - 2D address homed on this core: `*pointer` is set to the local pointer.
/// In every other case `*pointer` is left untouched.
///
/// @param request_address global address of the requested data
/// @param count           pointer to the number of elements requested
/// @param pointer         out-parameter receiving the local pointer
NullAcquirer(GlobalAddress<T> * request_address, size_t * count, T** pointer)
  : request_address_(request_address), count_(count), pointer_(pointer)
{
  VLOG(6) << "pointer = " << pointer << ", pointer_ = " << pointer_;
  // The original compared the `count` POINTER against 0, so a real
  // zero-length request was never detected. Test the pointed-to count,
  // and keep treating a null pointer as zero-length.
  if( count_ == NULL || *count_ == 0 ) {
    DVLOG(5) << "Zero-length acquire";
    *pointer_ = NULL;
  } else if( request_address_->is_2D() && request_address_->core() == Grappa::mycore() ) {
    DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
    *pointer_ = request_address_->pointer();
  }
}
/// Asynchronous lookup: spawns a remote task (tracked by GCE) on the core
/// owning the key's cell. If `lookup_local` finds the key there, the
/// continuation `f` is invoked with the stored value; otherwise `f` is not
/// called.
void lookup ( K key, CF f ) {
  uint64_t cell_index = computeIndex( key );
  GlobalAddress< Cell > target = base + cell_index;

  // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
  //TODO optimization where only need to do remotePrivateTask instead of call_async
  //if you are going to do more suspending ops (comms) inside the loop
  Grappa::spawnRemote<GCE>( target.core(), [key, target, f, this]() {
    Entry found;
    if (!lookup_local( key, target.pointer(), &found)) {
      return;
    }
    f(found.value);
  });
}
/// Delegate compare-and-swap: if the value at `target` equals `cmp_val`,
/// overwrite it with `new_val`.
/// The comparison and the store happen inside a single delegate lambda run
/// via `call` on `target.core()`.
/// @return true if the swap was performed, false if the comparison failed
bool compare_and_swap(GlobalAddress<T> target, U cmp_val, V new_val) {
  static_assert(std::is_convertible<T,U>::value, "type of cmp_val must match GlobalAddress type");
  static_assert(std::is_convertible<T,V>::value, "type of new_val must match GlobalAddress type");

  delegate_cmpswaps++;

  return call(target.core(), [target, cmp_val, new_val]() -> bool {
    T * slot = target.pointer();
    delegate_cmpswap_targets++;

    if (!(cmp_val == *slot)) {
      return false;
    }
    *slot = new_val;
    return true;
  });
}
/// Time `FLAGS_nelems` operations against the global vector `qa`.
/// The operation mix is selected by `EXP`: QUEUE and STACK randomly choose
/// between push and dequeue/pop according to `FLAGS_fraction_push`, while
/// PUSH, POP and DEQUEUE issue only that operation.
/// @return elapsed wall-clock time as reported by Grappa::walltime()
double perf_test(GlobalAddress<GlobalVector<int64_t>> qa) {
  const double started = Grappa::walltime();

  forall(0, FLAGS_nelems, [qa](int64_t i){
    switch (EXP) {
      case Exp::QUEUE:
        if (choose_random(FLAGS_fraction_push)) {
          qa->push(next_random<int64_t>());
        } else {
          qa->dequeue();
        }
        break;

      case Exp::STACK:
        if (choose_random(FLAGS_fraction_push)) {
          qa->push(next_random<int64_t>());
        } else {
          // qa->pop();
          CHECK_GT(qa->pop(), -1);
        }
        break;

      case Exp::PUSH:
        qa->push(next_random<int64_t>());
        break;

      case Exp::POP:
        qa->pop();
        break;

      case Exp::DEQUEUE:
        qa->dequeue();
        break;

      default:
        break;
    }
  });

  return Grappa::walltime() - started;
}
/// Append `o` to the currently active buffer and flush once that buffer
/// holds BUFSIZE elements. Requires `target_array` to be non-NULL.
void push(const T& o) {
  CHECK(target_array.pointer() != NULL) << "buffer not initialized!";

  auto& active = buf[curr_buf];
  active[curr_size[curr_buf]] = o;
  ++curr_size[curr_buf];

  CHECK(curr_size[curr_buf] <= BUFSIZE);

  const bool buffer_full = (curr_size[curr_buf] == BUFSIZE);
  if (buffer_full) {
    flush();
  }
}
void reset( ) { DVLOG(5) << "In " << __PRETTY_FUNCTION__; CHECK( !acquire_started_ || acquired_ ) << "inconsistent state for reset"; acquire_started_ = false; acquired_ = false; thread_ = NULL; num_messages_ = 0; response_count_ = 0; expected_reply_payload_ = sizeof( T ) * *count_; total_reply_payload_ = 0; start_time_ = 0; network_time_ = 0; if( *count_ == 0 ) { DVLOG(5) << "Zero-length acquire"; *pointer_ = NULL; acquire_started_ = true; acquired_ = true; } else if( request_address_->is_2D() ) { num_messages_ = 1; if( request_address_->core() == Grappa::mycore() ) { DVLOG(5) << "Short-circuiting to address " << request_address_->pointer(); *pointer_ = request_address_->pointer(); acquire_started_ = true; acquired_ = true; } } else { DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ; DVLOG(5) << ", request_address is " << *request_address_; DVLOG(5) << ", sizeof(T) is " << sizeof(T); DVLOG(5) << ", count is " << *count_; DVLOG(5) << ", block_min is " << request_address_->block_min(); DVLOG(5) << "Straddle: address is " << *request_address_ ; DVLOG(5) << ", address + count is " << *request_address_ + *count_; ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - request_address_->first_byte().block_min() ); DVLOG(5) << "Straddle: address block max is " << request_address_->block_max(); DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max(); DVLOG(5) << " address block min " << request_address_->block_min(); DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size; num_messages_ = byte_diff / block_size; } if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************"; DVLOG(5) << "In " << __PRETTY_FUNCTION__ << "; detecting straddle for sizeof(T):" << sizeof(T) << " count:" << *count_ << " num_messages_:" << num_messages_ << " request_address:" << *request_address_; }
/// Combine one value per core with `ReduceOp` and return the result on the
/// calling core.
///
/// Every other core reads its localized copy of `localizable` and sends it
/// back; each reply folds the value into `total` and completes `ce`. The
/// caller blocks on `ce` (initialised to `cores() - 1`) before returning.
T reduce( GlobalAddress<T> localizable ) {
  CompletionEvent ce(cores() - 1);

  // start from this core's own contribution
  T total = *(localizable.localize());
  Core origin = mycore();

  for (Core c = 0; c < cores(); c++) {
    if (c == origin) {
      continue;
    }
    send_heap_message(c, [localizable, &ce, &total, origin]{
      T val = *(localizable.localize());
      // ship the value home; `ce` and `total` are only touched back on origin
      send_heap_message(origin, [val,&ce,&total] {
        total = ReduceOp(total, val);
        ce.complete();
      });
    });
  }

  ce.wait();
  return total;
}
/// Look up the weight of the edge from vertex `i` to vertex `j`.
///
/// Runs a delegate call on vertex `i` and scans its `nadj` adjacency
/// entries for one whose `id` equals `j`.
///
/// @param g graph handle
/// @param i source vertex index
/// @param j destination vertex id
/// @return the matching edge's weight, or 0.0 if `i` has no edge to `j`
static double get_edge_weight(GlobalAddress<G> g, int64_t i, int64_t j) {
  return delegate::call(g->vs+i, [=](Vertex& v){
    // The loop condition used to be just `v.nadj`, which never becomes false
    // as k grows, so a miss ran past the end of the adjacency list.
    for (int64_t k = 0; k < v.nadj; k++) {
      auto e = g->edge(v,k);
      if (e.id == j) return e->weight;
    }
    // reached only when no edge i -> j exists; possibly better to throw exception
    return 0.0;
  });
}
/// Blocking lookup of `key`.
///
/// Runs a delegate call on the core owning the key's cell and copies the
/// result back.
///
/// @param key key to search for
/// @param val out-parameter; always assigned from the delegate's result,
///            but only meaningful when the function returns true
/// @return true if the key was found, false otherwise
bool lookup ( K key, V * val ) {
  uint64_t index = computeIndex( key );
  GlobalAddress< Cell > target = base + index;

  // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
  lookup_result result = Grappa::delegate::call( target.core(), [key,target,this]() {

    DHT_TYPE(lookup_result) lr;
    // Must be set explicitly: on a miss nothing else writes `valid`, and the
    // original returned an indeterminate bool in that case.
    lr.valid = false;

    Entry e;
    if (lookup_local( key, target.pointer(), &e)) {
      lr.valid = true;
      lr.result = e.value;
    }

    return lr;
  });

  *val = result.result;
  return result.valid;
}
void do_release() { size_t total_bytes = *count_ * sizeof(T); RequestArgs args; args.request_address = *request_address_; DVLOG(5) << "Computing request_bytes from block_max " << request_address_->first_byte().block_max() << " and " << *request_address_; args.reply_address = make_global( this ); size_t offset = 0; size_t request_bytes = 0; for( size_t i = 0; offset < total_bytes; offset += request_bytes, i++) { request_bytes = args.request_address.first_byte().block_max() - args.request_address.first_byte(); if( request_bytes > total_bytes - offset ) { request_bytes = total_bytes - offset; } DVLOG(5) << "sending release request with " << request_bytes << " of total bytes = " << *count_ * sizeof(T) << " to " << args.request_address; Grappa::send_heap_message(args.request_address.core(), [args](void * payload, size_t payload_size) { IRMetrics::count_release_ams( payload_size ); DVLOG(5) << "Worker " << Grappa::current_worker() << " received release request to " << args.request_address << " reply to " << args.reply_address; memcpy( args.request_address.pointer(), payload, payload_size ); auto reply_address = args.reply_address; Grappa::send_heap_message(args.reply_address.core(), [reply_address]{ DVLOG(5) << "Worker " << Grappa::current_worker() << " received release reply to " << reply_address; reply_address.pointer()->release_reply(); }); DVLOG(5) << "Worker " << Grappa::current_worker() << " sent release reply to " << reply_address; }, (char*)(*pointer_) + offset, request_bytes ); // TODO: change type so we don't screw with pointer like this args.request_address = GlobalAddress<T>::Raw( args.request_address.raw_bits() + request_bytes ); } DVLOG(5) << "release started for " << args.request_address; // blocks here waiting for messages to be sent }
/// Combining fetch-and-add: several local callers pool their increments
/// and a single delegate call applies the sum to `target`.
///
/// Each caller adds `inc` to the shared `increment` total. The caller that
/// meets the flush condition issues one `call` that atomically (within the
/// delegate lambda) reads `*target` and adds the pooled total, then
/// broadcasts on `untilReceived`; all other callers wait on it. Afterwards
/// every caller takes the current `result` as its own start value and
/// advances `result` by its `inc`, so callers receive consecutive ranges.
///
/// @param inc amount this caller adds
/// @return the value this caller's increment was applied on top of
T fetch_and_add( U inc ) {

  // wait until the unit accepts new participants (see block_until_ready())
  block_until_ready();

  // fetch add unit is now aggregating so add my inc

  participant_count++;
  committed--;
  increment += inc;

  // if I'm the last entered client and either the flush threshold
  // is reached or there are no more committed participants then start the flush
  if ( ready_waiters == 0 && (participant_count >= flush_threshold || committed == 0 )) {
    set_not_ready();
    // snapshot the pooled amount before the blocking call
    uint64_t increment_total = increment;
    flat_combiner_fetch_and_add_amount += increment_total;
    auto t = target;
    result = call(target.core(), [t, increment_total]() -> U {
      T * p = t.pointer();
      uint64_t r = *p;
      *p += increment_total;
      return r;
    });
    // tell the others that the result has arrived
    Grappa::broadcast(&untilReceived);
  } else {
    // someone else will start the flush
    Grappa::wait(&untilReceived);
  }

  // hand out this caller's slice of the fetched range
  uint64_t my_start = result;
  result += inc;
  participant_count--;
  increment -= inc;   // for validation purposes (could just set to 0)
  // the last participant to leave re-opens the unit
  if ( participant_count == 0 ) {
    CHECK( increment == 0 ) << "increment = " << increment << " even though all participants are done";
    set_ready();
  }
  return my_start;
}
void reset( ) { CHECK( !release_started_ || released_ ) << "inconsistent state for reset"; release_started_ = false; released_ = false; thread_ = NULL; num_messages_ = 0; response_count_ = 0; if( *count_ == 0 ) { DVLOG(5) << "Zero-length release"; release_started_ = true; released_ = true; } else if( request_address_->is_2D() ) { num_messages_ = 1; if( request_address_->core() == Grappa::mycore() ) { release_started_ = true; released_ = true; } } else { DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ; DVLOG(5) << ", request_address is " << *request_address_; DVLOG(5) << ", sizeof(T) is " << sizeof(T); DVLOG(5) << ", count is " << *count_; DVLOG(5) << ", block_min is " << request_address_->block_min(); DVLOG(5) << "Straddle: address is " << *request_address_ ; DVLOG(5) << ", address + count is " << *request_address_ + *count_; ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - request_address_->first_byte().block_min() ); DVLOG(5) << "Straddle: address block max is " << request_address_->block_max(); DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max(); DVLOG(5) << " address + count -1 block max is " << (*request_address_ + *count_ - 1).block_max(); DVLOG(5) << " difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() ); DVLOG(5) << " multiplied difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() ) * sizeof(T); DVLOG(5) << " address block min " << request_address_->block_min(); DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size; num_messages_ = byte_diff / block_size; } if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************"; DVLOG(5) << "Detecting straddle for sizeof(T):" << sizeof(T) << " count:" << *count_ << " num_messages_:" << num_messages_ << " request_address:" << 
*request_address_; }
/// Overload to work on GlobalAddresses. // template<typename CompletionType> // inline void complete(GlobalAddress<CompletionType> ce, int64_t decr = 1) { // static_assert(std::is_base_of<CompletionEvent,CompletionType>::value, // "complete() can only be called on subclasses of CompletionEvent"); inline void complete(GlobalAddress<CompletionEvent> ce, int64_t decr = 1) { DVLOG(5) << "complete CompletionEvent"; if (ce.core() == mycore()) { ce.pointer()->complete(decr); } else { ce_remote_completions += decr; if (decr == 1) { // (common case) don't send full 8 bytes just to decrement by 1 send_heap_message(ce.core(), [ce] { ce.pointer()->complete(); }); } else { send_heap_message(ce.core(), [ce,decr] { ce.pointer()->complete(decr); }); } } }
/// Invoke `activate()` on the vertex named by `v` through a delegate call.
void activate(GlobalAddress<V> v) {
  delegate::call(v, [](V& vertex){
    vertex->activate();
  });
}
void bfs(GlobalAddress<G> _g, int nbfs, TupleGraph tg) { bool verified = false; double t; auto _frontier = GlobalBag<VertexID>::create(_g->nv); auto _next = GlobalBag<VertexID>::create(_g->nv); call_on_all_cores([=]{ frontier = _frontier; next = _next; g = _g; }); // do BFS from multiple different roots and average their times for (int root_idx = 0; root_idx < nbfs; root_idx++) { // intialize parent to -1 forall(g, [](G::Vertex& v){ v->init(); v->level = -1; }); VertexID root; if (FLAGS_max_degree_source) { forall(g, [](VertexID i, G::Vertex& v){ max_degree << MaxDegree(i, v.nadj); }); root = static_cast<MaxDegree>(max_degree).idx(); } else { root = choose_root(g); } // setup 'root' as the parent of itself delegate::call(g->vs+root, [=](G::Vertex& v){ v->parent = root; v->level = 0; }); // reset frontier queues next->clear(); frontier->clear(); // start with root as only thing in frontier delegate::call((g->vs+root).core(), [=]{ frontier->add(root); }); t = walltime(); bool top_down = true; int64_t prev_nf = -1; int64_t frontier_edges = 0; int64_t remaining_edges = g->nadj; while (!frontier->empty()) { auto nf = frontier->size(); VLOG(1) << "remaining_edges = " << remaining_edges << ", nf = " << nf << ", prev_nf = " << prev_nf << ", frontier_edges: " ; if (top_down && frontier_edges > remaining_edges/FLAGS_beamer_alpha && nf > prev_nf) { VLOG(1) << "switching to bottom-up"; top_down = false; } else if (!top_down && frontier_edges < g->nv/FLAGS_beamer_beta && nf < prev_nf) { VLOG(1) << "switching to top-down"; top_down = true; } edge_count = 0; if (top_down) { // iterate over vertices in this level of the frontier forall(frontier, [](VertexID& i){ // visit all the adjacencies of the vertex // note: this has to be 'async' to prevent deadlock from // running out of available workers forall<async>(adj(g,i), [i](G::Edge& e) { auto j = e.id; // at the core where the vertex is... 
delegate::call<async>(e.ga, [i,j](G::Vertex& vj){ // note: no synchronization needed because 'call' is // guaranteed to be executed atomically because it // does no blocking operations if (vj->parent == -1) { // claim parenthood vj->parent = i; vj->level = current_depth; next->add(j); edge_count += vj.nadj; } }); }); }); } else { // bottom-up forall<&phaser>(g, [](G::Vertex& v){ if (v->level != -1) return; auto va = make_linear(&v); forall<async,&phaser>(adj(g,v), [=,&v](G::Edge& e){ if (v->level != -1) return; phaser.enroll(); auto eva = e.ga; send_heap_message(eva.core(), [=]{ auto& ev = *eva.pointer(); if (ev->level != -1 && ev->level < current_depth) { auto eid = g->id(ev); send_heap_message(va.core(), [=]{ auto& v = *va.pointer(); if (v->level == -1) { next->add(g->id(v)); v->level = current_depth; v->parent = eid; edge_count += v.nadj; } phaser.complete(); }); } else { phaser.send_completion(va.core()); } }); }); }); } call_on_all_cores([=]{ current_depth++; // switch to next frontier level std::swap(frontier, next); }); next->clear(); frontier_edges = edge_count; remaining_edges -= frontier_edges; prev_nf = nf; } // while (frontier not empty) double this_bfs_time = walltime() - t; LOG(INFO) << "(root=" << root << ", time=" << this_bfs_time << ")"; if (!verified) { // only verify the first one to save time t = walltime(); bfs_nedge = verify(tg, g, root); verify_time = (walltime()-t); LOG(INFO) << verify_time; verified = true; Metrics::reset_all_cores(); // don't count the first one } else { total_time += this_bfs_time; } bfs_mteps += bfs_nedge / this_bfs_time / 1.0e6; } }
/// Deliver `val` to the FullEmpty named by `result_addr`: a heap message to
/// its core performs `writeXF(val)` on it.
void fill_remote(GlobalAddress<FullEmpty<T>> result_addr, const T& val) {
  const auto home = result_addr.core();
  send_heap_message(home, [result_addr,val]{
    result_addr->writeXF(val);
  });
}
/// Enroll `incr` additional participants on the CompletionEvent named by
/// `ce`, using `impl::call` on the event's core.
inline void enroll(GlobalAddress<CompletionEvent> ce, int64_t incr = 1) {
  const auto home = ce.core();
  impl::call(home, [ce,incr]{
    ce->enroll(incr);
  });
}
void do_acquire() { size_t total_bytes = *count_ * sizeof(T); RequestArgs args; args.request_address = *request_address_; DVLOG(5) << "Computing request_bytes from block_max " << request_address_->first_byte().block_max() << " and " << *request_address_; args.reply_address = make_global( this ); args.offset = 0; for(size_t i = 0; args.offset < total_bytes; args.offset += args.request_bytes, i++) { args.request_bytes = args.request_address.first_byte().block_max() - args.request_address.first_byte(); if( args.request_bytes > total_bytes - args.offset ) { args.request_bytes = total_bytes - args.offset; } DVLOG(5) << "sending acquire request for " << args.request_bytes << " of total bytes = " << *count_ * sizeof(T) << " from " << args.request_address; Grappa::send_heap_message(args.request_address.core(), [args]{ IAMetrics::count_acquire_ams( args.request_bytes ); DVLOG(5) << "Worker " << Grappa::current_worker() << " received acquire request to " << args.request_address << " size " << args.request_bytes << " offset " << args.offset << " reply to " << args.reply_address; DVLOG(5) << "Worker " << Grappa::current_worker() << " sending acquire reply to " << args.reply_address << " offset " << args.offset << " request address " << args.request_address << " payload address " << args.request_address.pointer() << " payload size " << args.request_bytes; // note: this will read the payload *later* when the message is copied into the actual send buffer, // should be okay because we're already assuming DRF, but something to watch out for auto reply_address = args.reply_address; auto offset = args.offset; Grappa::send_heap_message(args.reply_address.core(), [reply_address, offset](void * payload, size_t payload_size) { DVLOG(5) << "Worker " << Grappa::current_worker() << " received acquire reply to " << reply_address << " offset " << offset << " payload size " << payload_size; reply_address.pointer()->acquire_reply( offset, payload, payload_size); }, 
args.request_address.pointer(), args.request_bytes ); DVLOG(5) << "Worker " << Grappa::current_worker() << " sent acquire reply to " << args.reply_address << " offset " << args.offset << " request address " << args.request_address << " payload address " << args.request_address.pointer() << " payload size " << args.request_bytes; }); // TODO: change type so we don't screw with pointer like this args.request_address = GlobalAddress<T>::Raw( args.request_address.raw_bits() + args.request_bytes ); } DVLOG(5) << "acquire started for " << args.request_address; }
void bucket_sort(GlobalAddress<S> array, size_t nelems, int (*Scmp)(const void*,const void*), size_t (*lobits)(S,int), int log2buckets, int log2maxkey ) { double t, sort_time, histogram_time, allreduce_time, scatter_time, local_sort_scatter_time, put_back_time; int LOBITS = log2maxkey - log2buckets; size_t nbuckets = 1 << log2buckets; GlobalAddress<bucket_t<S> > bucketlist = Grappa::global_alloc<bucket_t<S> >(nbuckets); #ifdef DEBUG for (size_t i=0; i<nbuckets; i++) { GlobalAddress<bucket_t<S> > bi = bucketlist+i; VLOG(1) << "bucket[" << i << "] on Node " << bi.core() << ", offset = " << bi.pointer() - bucketlist.localize(bi.core()) << ", ptr = " << bi.pointer(); } #endif sort_time = Grappa::walltime(); // initialize globals and histogram counts // { setup_counts f(array, nbuckets, bucketlist); fork_join_custom(&f); } on_all_cores([nbuckets]{ counts.resize(nbuckets); offsets.resize(nbuckets); for (size_t i=0; i<nbuckets; i++) { counts[i] = 0; } }); t = Grappa::walltime(); // do local bucket counts // forall_local<uint64_t,histogram>(array, nelems); forall(array, nelems, [lobits,LOBITS](int64_t i, S& v) { size_t b = lobits(v, LOBITS); // TODO decide how to compare general in pieces counts[b]++; }); histogram_time = Grappa::walltime() - t; LOG(INFO) << "histogram_time: " << histogram_time; t = Grappa::walltime(); // allreduce everyone's counts & compute global offsets (prefix sum) // { aggregate_counts f; fork_join_custom(&f); } on_all_cores([nbuckets] { CHECK_EQ(nbuckets, counts.size()); // all nodes get total counts put into their counts array allreduce_inplace<size_t,collective_add>(&counts[0], counts.size()); VLOG(1) << "after allreduce_inplace (just in case)"; // everyone computes prefix sum over buckets locally offsets[0] = 0; for (size_t i=1; i<offsets.size(); i++) { offsets[i] = offsets[i-1] + counts[i-1]; } }); allreduce_time = Grappa::walltime() - t; LOG(INFO) << "allreduce_time: " << allreduce_time; // allocate space in buckets VLOG(3) << "allocating 
space..."; // forall_local<bucket_t,init_buckets>(bucketlist, nbuckets); forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){ // (global malloc doesn't call constructors) new (&bucket) bucket_t<S>(); bucket.reserve(counts[id]); }); VLOG(3) << "scattering..."; t = Grappa::walltime(); // scatter into buckets // forall_local<uint64_t,scatter>(array, nelems); forall(array, nelems, [bucketlist,lobits,LOBITS](int64_t s, int64_t n, S * first){ size_t nbuckets = counts.size(); for (int i=0; i<n; i++) { auto v = first[i]; size_t b = lobits(v, LOBITS); CHECK( b < nbuckets ) << "bucket id = " << b << ", nbuckets = " << nbuckets; // ff_delegate<bucket_t,uint64_t,ff_append>(bucketlist+b, v); auto destb = bucketlist+b; delegate::call<async>(destb.core(), [destb,v]{ destb.pointer()->append(v); }); } }); scatter_time = Grappa::walltime() - t; LOG(INFO) << "scatter_time: " << scatter_time; t = Grappa::walltime(); // sort buckets locally // forall_local<bucket_t,sort_bucket>(bucketlist, nbuckets); /// Do some kind of local serial sort of a bucket forall(bucketlist, nbuckets, [Scmp](int64_t bucket_id, bucket_t<S>& bucket){ if (bucket.size() == 0) return; qsort(&bucket[0], bucket.size(), sizeof(S), Scmp); }); local_sort_scatter_time = Grappa::walltime() - t; LOG(INFO) << "local_sort_time: " << local_sort_scatter_time; t = Grappa::walltime(); // redistribute buckets back into global array // forall_local<bucket_t,put_back_bucket>(bucketlist, nbuckets); /// Redistribute sorted buckets back into global array forall(bucketlist, nbuckets, [array,bucketlist](int64_t b, bucket_t<S>& bucket) { const size_t NBUF = BUFSIZE / sizeof(S); DCHECK( b < counts.size() ); // TODO: shouldn't need to buffer this, but a bug of some sort is currently forcing us to limit the number of outstanding messages //for_buffered(i, n, 0, bucket.size(), NBUF) { // typename Incoherent<S>::WO c(array+offsets[b]+i, n, &bucket[i]); // c.block_until_released(); // } if ( bucket.size() > 0 ) { typename 
Incoherent<S>::WO c(array+offsets[b], bucket.size(), &bucket[0]); // FIXME v is not in locale-shared-memory c.block_until_released(); } VLOG(3) << "bucket[" << b << "] release successful"; }); put_back_time = Grappa::walltime() - t; LOG(INFO) << "put_back_time: " << put_back_time; sort_time = Grappa::walltime() - sort_time; LOG(INFO) << "total_sort_time: " << sort_time; }
// release data at pointer in local heap /// (should be called only on node responsible for allocator) void local_free( GlobalAddress< void > address ) { void * va = reinterpret_cast< void * >( address.raw_bits() ); a_p_->free( va ); }