/// Read the value of a FullEmpty cell addressed globally.
///
/// If the cell lives on the calling core, its own readFF() is used directly.
/// Otherwise a message is sent to the owning core; the reply is delivered
/// into a FullEmpty on this function's stack, and the caller waits on that
/// local cell's readFF().
///
/// @param fe_addr  global address of the FullEmpty<T> to read
/// @return the value obtained from the cell's readFF()
T readFF(GlobalAddress<FullEmpty<T>> fe_addr) {
  if (fe_addr.core() == mycore()) {
    DVLOG(2) << "local";
    return fe_addr.pointer()->readFF();
  }

  // Landing slot for the reply; the remote side fills it via fill_remote().
  FullEmpty<T> reply;
  auto reply_addr = make_global(&reply);

  send_message(fe_addr.core(), [fe_addr, reply_addr] {
    auto* cell = fe_addr.pointer();

    if (cell->full()) {
      // Value already present: answer immediately.
      // DVLOG(2) << "no need to block";
      fill_remote(reply_addr, cell->readFF());
    } else {
      // Not full yet: register a suspended delegate as a waiter on the cell;
      // it sends the reply when it is run.
      DVLOG(2) << "setting up to block (" << fe_addr << ")";
      auto* waiter = SuspendedDelegate::create([cell, reply_addr] {
        VLOG(0) << "suspended_delegate!";
        fill_remote(reply_addr, cell->readFF());
      });
      add_waiter(cell, waiter);
    }
  });

  return reply.readFF();
}
/// Asynchronously add `inc` to the value stored at `target`.
///
/// Issues a delegate call in SyncMode::Async against completion event `C`
/// and bumps the `delegate_async_increments` counter.
///
/// @param target  global address of the value to modify
/// @param inc     amount added with `+=` on the owning core
void increment(GlobalAddress<T> target, U inc) {
  static_assert(std::is_convertible<T, U>::value,
                "type of inc must match GlobalAddress type");

  delegate_async_increments++;

  delegate::call<SyncMode::Async, C>(target.core(), [target, inc] {
    T* slot = target.pointer();
    *slot += inc;
  });
}
/// Insert-or-update the entry for `key` in the hash cell that owns it.
///
/// The cell is located with computeIndex(key) and the work is done by a
/// delegate call on the cell's home core:
///  - if the key is already present, its value becomes UpF(old_value, val);
///  - otherwise a new entry with value UpF(Init, val) is appended.
///
/// @param key  key to update
/// @param val  update operand passed to UpF
void update( K key, UV val ) {
  uint64_t index = computeIndex( key );
  GlobalAddress< Cell > target = base + index;

  Grappa::delegate::call( target.core(), [key, val, target]() {   // TODO: upgrade to call_async; using GCE
    // list of entries in this cell
    std::list<DHT_TYPE(Entry)> * entries = target.pointer()->entries;

    // if first time the cell is hit then initialize
    if ( entries == NULL ) {
      entries = new std::list<Entry>();
      target.pointer()->entries = entries;
    }

    // find matching key in the list
    typename std::list<DHT_TYPE(Entry)>::iterator i;
    for (i = entries->begin(); i != entries->end(); ++i) {
      if ( i->key == key ) {
        // key found so update
        i->value = UpF(i->value, val);
        // NOTE(review): the size metric is bumped on the update path only;
        // kept as-is — confirm whether it was meant for the insert path.
        hash_tables_size += 1;
        return 0;
      }
    }

    // this is the first time the key has been seen
    // so add it to the list
    Entry newe( key, UpF(Init, val));
    // Fix: the entry used to be constructed and then discarded, so
    // first-seen keys were never stored.
    entries->push_back( newe );

    return 0;
  });
}
/// Fetch a copy of the value stored at `target` through a delegate call.
///
/// Increments `delegate_reads` on the caller and `delegate_read_targets`
/// inside the delegate.
///
/// @param target  global address to read
/// @return the value found at `target`
T read(GlobalAddress<T> target) {
  delegate_reads++;

  return call<S, C>(target.core(), [target]() -> T {
    delegate_read_targets++;
    T* src = target.pointer();
    return *src;
  });
}
/// Acquire a mutex identified by a global address.
///
/// Only a mutex on the calling core is supported: it is forwarded to the
/// pointer overload of lock(). A remote mutex trips CHECK(false).
///
/// @param m  global address of the mutex
inline void lock( GlobalAddress<Mutex> m ) {
  const bool is_local = ( m.core() == mycore() );

  if( !is_local ) {
    // if remote, spawn a task on the home node to acquire
    // (not implemented: fail hard)
    CHECK(false);
    return;
  }

  // if local, just acquire
  lock( m.pointer() );
}
/// Overload to work on GlobalAddresses. // template<typename CompletionType> // inline void complete(GlobalAddress<CompletionType> ce, int64_t decr = 1) { // static_assert(std::is_base_of<CompletionEvent,CompletionType>::value, // "complete() can only be called on subclasses of CompletionEvent"); inline void complete(GlobalAddress<CompletionEvent> ce, int64_t decr = 1) { DVLOG(5) << "complete CompletionEvent"; if (ce.core() == mycore()) { ce.pointer()->complete(decr); } else { ce_remote_completions += decr; if (decr == 1) { // (common case) don't send full 8 bytes just to decrement by 1 send_heap_message(ce.core(), [ce] { ce.pointer()->complete(); }); } else { send_heap_message(ce.core(), [ce,decr] { ce.pointer()->complete(decr); }); } } }
/// Store `value` at `target` through a delegate call.
///
/// Increments `delegate_writes` on the caller and `delegate_write_targets`
/// inside the delegate.
///
/// @param target  global address to overwrite
/// @param value   value assigned to the target
void write(GlobalAddress<T> target, U value) {
  static_assert(std::is_convertible<T, U>::value,
                "type of value must match GlobalAddress type");

  delegate_writes++;

  // TODO: don't return any val, requires changes to `delegate::call()`.
  return call<S, C>(target.core(), [target, value] {
    delegate_write_targets++;
    T* dst = target.pointer();
    *dst = value;
  });
}
/// Add `inc` to the value at `target` and return the value it held before.
///
/// The read and the add both happen inside one delegate call. Increments
/// `delegate_fetchadds` on the caller and `delegate_fetchadd_targets` inside
/// the delegate.
///
/// @param target  global address of the value
/// @param inc     amount added with `+=`
/// @return the value stored at `target` before the addition
T fetch_and_add(GlobalAddress<T> target, U inc) {
  delegate_fetchadds++;

  return call(target.core(), [target, inc]() -> T {
    delegate_fetchadd_targets++;

    T& slot = *target.pointer();
    T previous = slot;
    slot += inc;
    return previous;
  });
}
/// Return this acquirer to its initial state and work out how many request
/// messages the described range needs.
///
/// Must not be called while an acquire has started but not finished (the
/// CHECK below enforces this). After clearing the bookkeeping fields, one of
/// three cases applies:
///  - zero elements: the output pointer is set to NULL and the acquire is
///    marked started and finished;
///  - is_2D() address: one message; if the address is on this core the
///    output pointer is set to the local pointer and the acquire is marked
///    started and finished;
///  - otherwise: num_messages_ is the byte span from the first byte's
///    block_min() to the last byte's block_max(), divided by block_size.
void reset( ) {
  DVLOG(5) << "In " << __PRETTY_FUNCTION__;
  // Resetting in the middle of an acquire is not allowed.
  CHECK( !acquire_started_ || acquired_ ) << "inconsistent state for reset";
  acquire_started_ = false;
  acquired_ = false;
  thread_ = NULL;
  num_messages_ = 0;
  response_count_ = 0;
  // Bytes expected back in total: element size times element count.
  expected_reply_payload_ = sizeof( T ) * *count_;
  total_reply_payload_ = 0;
  start_time_ = 0;
  network_time_ = 0;
  if( *count_ == 0 ) {
    // Nothing to fetch: complete immediately with a NULL result pointer.
    DVLOG(5) << "Zero-length acquire";
    *pointer_ = NULL;
    acquire_started_ = true;
    acquired_ = true;
  } else if( request_address_->is_2D() ) {
    num_messages_ = 1;
    if( request_address_->core() == Grappa::mycore() ) {
      // Data is on this core: hand out the local pointer, no messages needed.
      DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
      *pointer_ = request_address_->pointer();
      acquire_started_ = true;
      acquired_ = true;
    }
  } else {
    DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ;
    DVLOG(5) << ", request_address is " << *request_address_;
    DVLOG(5) << ", sizeof(T) is " << sizeof(T);
    DVLOG(5) << ", count is " << *count_;
    DVLOG(5) << ", block_min is " << request_address_->block_min();
    DVLOG(5) << "Straddle: address is " << *request_address_ ;
    DVLOG(5) << ", address + count is " << *request_address_ + *count_;
    // Span from the start of the block holding the first byte to the block_max()
    // of the last byte of the last element.
    ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - request_address_->first_byte().block_min() );
    DVLOG(5) << "Straddle: address block max is " << request_address_->block_max();
    DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max();
    DVLOG(5) << " address block min " << request_address_->block_min();
    DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size;
    // One message per block covered by the span.
    num_messages_ = byte_diff / block_size;
  }
  if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************";
  DVLOG(5) << "In " << __PRETTY_FUNCTION__ << "; detecting straddle for sizeof(T):" << sizeof(T) << " count:" << *count_ << " num_messages_:" << num_messages_ << " request_address:" << *request_address_;
}
/// Build an acquirer that performs no communication.
///
/// Stores the three pointers and, where it can, resolves the output pointer
/// straight away:
///  - zero-length request (or no count supplied): `*pointer` is set to NULL;
///  - is_2D() address on this core: `*pointer` is set to the local pointer.
/// In every other case `*pointer` is left untouched.
///
/// @param request_address  pointer to the global address being requested
/// @param count            pointer to the number of elements requested
/// @param pointer          out-parameter receiving the local pointer
NullAcquirer(GlobalAddress<T> * request_address, size_t * count, T** pointer)
  : request_address_(request_address), count_(count), pointer_(pointer)
{
  VLOG(6) << "pointer = " << pointer << ", pointer_ = " << pointer_;
  // Fix: the old test `count == 0` compared the pointer itself against null,
  // so a genuine zero-element request was never detected. Test the pointed-to
  // value; a null `count` is still treated as zero-length, as before.
  if( count_ == NULL || *count_ == 0 ) {
    DVLOG(5) << "Zero-length acquire";
    *pointer_ = NULL;
  } else if( request_address_->is_2D() &&
             request_address_->core() == Grappa::mycore() ) {
    DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
    *pointer_ = request_address_->pointer();
  }
}
/// Look up `key` and, if it is found, invoke `f` on its value.
///
/// The work is spawned on the core owning the key's cell (via
/// spawnRemote<GCE>); `f` is called there, and only when lookup_local()
/// reports a hit.
///
/// @param key  key to search for
/// @param f    callable invoked with the stored value on a hit
void lookup ( K key, CF f ) {
  uint64_t slot = computeIndex( key );
  GlobalAddress< Cell > cell = base + slot;

  // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
  // TODO optimization where only need to do remotePrivateTask instead of call_async
  // if you are going to do more suspending ops (comms) inside the loop
  Grappa::spawnRemote<GCE>( cell.core(), [key, cell, f, this]() {
    Entry found;
    if ( !lookup_local( key, cell.pointer(), &found ) ) {
      return;
    }
    f( found.value );
  });
}
/// Return this releaser to its initial state and work out how many messages
/// the described range needs.
///
/// Must not be called while a release has started but not finished (the
/// CHECK below enforces this). After clearing the bookkeeping fields, one of
/// three cases applies:
///  - zero elements: the release is marked started and finished;
///  - is_2D() address: one message; if the address is on this core the
///    release is marked started and finished;
///  - otherwise: num_messages_ is the byte span from the first byte's
///    block_min() to the last byte's block_max(), divided by block_size.
void reset( ) {
  // Resetting in the middle of a release is not allowed.
  CHECK( !release_started_ || released_ ) << "inconsistent state for reset";
  release_started_ = false;
  released_ = false;
  thread_ = NULL;
  num_messages_ = 0;
  response_count_ = 0;
  if( *count_ == 0 ) {
    // Nothing to send: complete immediately.
    DVLOG(5) << "Zero-length release";
    release_started_ = true;
    released_ = true;
  } else if( request_address_->is_2D() ) {
    num_messages_ = 1;
    if( request_address_->core() == Grappa::mycore() ) {
      // Target is on this core: treated as already released.
      release_started_ = true;
      released_ = true;
    }
  } else {
    DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ;
    DVLOG(5) << ", request_address is " << *request_address_;
    DVLOG(5) << ", sizeof(T) is " << sizeof(T);
    DVLOG(5) << ", count is " << *count_;
    DVLOG(5) << ", block_min is " << request_address_->block_min();
    DVLOG(5) << "Straddle: address is " << *request_address_ ;
    DVLOG(5) << ", address + count is " << *request_address_ + *count_;
    // Span from the start of the block holding the first byte to the block_max()
    // of the last byte of the last element.
    ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - request_address_->first_byte().block_min() );
    DVLOG(5) << "Straddle: address block max is " << request_address_->block_max();
    DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max();
    DVLOG(5) << " address + count -1 block max is " << (*request_address_ + *count_ - 1).block_max();
    DVLOG(5) << " difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() );
    DVLOG(5) << " multiplied difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() ) * sizeof(T);
    DVLOG(5) << " address block min " << request_address_->block_min();
    DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size;
    // One message per block covered by the span.
    num_messages_ = byte_diff / block_size;
  }
  if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************";
  DVLOG(5) << "Detecting straddle for sizeof(T):" << sizeof(T) << " count:" << *count_ << " num_messages_:" << num_messages_ << " request_address:" << *request_address_;
}
/// Replace the value at `target` with `new_val` if it currently equals
/// `cmp_val`.
///
/// The comparison and the store happen inside one delegate call. Increments
/// `delegate_cmpswaps` on the caller and `delegate_cmpswap_targets` inside
/// the delegate.
///
/// @param target   global address of the value
/// @param cmp_val  value the target is compared against
/// @param new_val  value stored when the comparison succeeds
/// @return true if the swap was performed, false otherwise
bool compare_and_swap(GlobalAddress<T> target, U cmp_val, V new_val) {
  static_assert(std::is_convertible<T, U>::value,
                "type of cmp_val must match GlobalAddress type");
  static_assert(std::is_convertible<T, V>::value,
                "type of new_val must match GlobalAddress type");

  delegate_cmpswaps++;

  return call(target.core(), [target, cmp_val, new_val]() -> bool {
    T& slot = *target.pointer();
    delegate_cmpswap_targets++;

    if (!(cmp_val == slot)) {
      return false;
    }
    slot = new_val;
    return true;
  });
}
/// Combined fetch-and-add: several callers pool their increments and one of
/// them issues a single delegate call for the total.
///
/// Each caller adds `inc` to the shared `increment`. The caller that finds no
/// ready waiters, with either the flush threshold reached or no committed
/// participants left, performs the remote add and broadcasts on
/// `untilReceived`; everyone else waits on it. Each participant then takes
/// the current `result` as its own return value and advances `result` by its
/// `inc`.
///
/// @param inc  this caller's increment
/// @return the value of `result` at the moment this caller claims its share
T fetch_and_add( U inc ) {
  block_until_ready();

  // fetch add unit is now aggregating so add my inc
  participant_count++;
  committed--;
  increment += inc;

  // if I'm the last entered client and either the flush threshold
  // is reached or there are no more committed participants then start the flush
  const bool i_start_flush =
      ready_waiters == 0 &&
      ( participant_count >= flush_threshold || committed == 0 );

  if ( !i_start_flush ) {
    // someone else will start the flush
    Grappa::wait(&untilReceived);
  } else {
    set_not_ready();

    uint64_t batch_total = increment;
    flat_combiner_fetch_and_add_amount += batch_total;

    auto dest = target;
    result = call(target.core(), [dest, batch_total]() -> U {
      T * cell = dest.pointer();
      uint64_t before = *cell;
      *cell += batch_total;
      return before;
    });

    // tell the others that the result has arrived
    Grappa::broadcast(&untilReceived);
  }

  // Claim my slice of the combined range.
  uint64_t my_base = result;
  result += inc;

  participant_count--;
  increment -= inc; // for validation purposes (could just set to 0)

  if ( participant_count == 0 ) {
    CHECK( increment == 0 ) << "increment = " << increment << " even though all participants are done";
    set_ready();
  }

  return my_base;
}
/// Blocking lookup of `key`.
///
/// Runs lookup_local() on the core owning the key's cell through a delegate
/// call and copies the outcome back to the caller.
///
/// @param key  key to search for
/// @param val  out-parameter; receives the `result` field of the reply
///             (the stored value when the key is found)
/// @return true if the key was found, false otherwise
bool lookup ( K key, V * val ) {
  uint64_t index = computeIndex( key );
  GlobalAddress< Cell > target = base + index;

  // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
  lookup_result result = Grappa::delegate::call( target.core(), [key,target,this]() {
    DHT_TYPE(lookup_result) lr;
    // Fix: `valid` was only assigned on a hit, so a miss returned whatever
    // the field happened to contain. Start from "not found" explicitly.
    lr.valid = false;

    Entry e;
    if (lookup_local( key, target.pointer(), &e)) {
      lr.valid = true;
      lr.result = e.value;
    }

    return lr;
  });

  // Written unconditionally, as before; only meaningful when the return
  // value is true.
  *val = result.result;
  return result.valid;
}
void bucket_sort(GlobalAddress<S> array, size_t nelems, int (*Scmp)(const void*,const void*), size_t (*lobits)(S,int), int log2buckets, int log2maxkey ) { double t, sort_time, histogram_time, allreduce_time, scatter_time, local_sort_scatter_time, put_back_time; int LOBITS = log2maxkey - log2buckets; size_t nbuckets = 1 << log2buckets; GlobalAddress<bucket_t<S> > bucketlist = Grappa::global_alloc<bucket_t<S> >(nbuckets); #ifdef DEBUG for (size_t i=0; i<nbuckets; i++) { GlobalAddress<bucket_t<S> > bi = bucketlist+i; VLOG(1) << "bucket[" << i << "] on Node " << bi.core() << ", offset = " << bi.pointer() - bucketlist.localize(bi.core()) << ", ptr = " << bi.pointer(); } #endif sort_time = Grappa::walltime(); // initialize globals and histogram counts // { setup_counts f(array, nbuckets, bucketlist); fork_join_custom(&f); } on_all_cores([nbuckets]{ counts.resize(nbuckets); offsets.resize(nbuckets); for (size_t i=0; i<nbuckets; i++) { counts[i] = 0; } }); t = Grappa::walltime(); // do local bucket counts // forall_local<uint64_t,histogram>(array, nelems); forall(array, nelems, [lobits,LOBITS](int64_t i, S& v) { size_t b = lobits(v, LOBITS); // TODO decide how to compare general in pieces counts[b]++; }); histogram_time = Grappa::walltime() - t; LOG(INFO) << "histogram_time: " << histogram_time; t = Grappa::walltime(); // allreduce everyone's counts & compute global offsets (prefix sum) // { aggregate_counts f; fork_join_custom(&f); } on_all_cores([nbuckets] { CHECK_EQ(nbuckets, counts.size()); // all nodes get total counts put into their counts array allreduce_inplace<size_t,collective_add>(&counts[0], counts.size()); VLOG(1) << "after allreduce_inplace (just in case)"; // everyone computes prefix sum over buckets locally offsets[0] = 0; for (size_t i=1; i<offsets.size(); i++) { offsets[i] = offsets[i-1] + counts[i-1]; } }); allreduce_time = Grappa::walltime() - t; LOG(INFO) << "allreduce_time: " << allreduce_time; // allocate space in buckets VLOG(3) << "allocating 
space..."; // forall_local<bucket_t,init_buckets>(bucketlist, nbuckets); forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){ // (global malloc doesn't call constructors) new (&bucket) bucket_t<S>(); bucket.reserve(counts[id]); }); VLOG(3) << "scattering..."; t = Grappa::walltime(); // scatter into buckets // forall_local<uint64_t,scatter>(array, nelems); forall(array, nelems, [bucketlist,lobits,LOBITS](int64_t s, int64_t n, S * first){ size_t nbuckets = counts.size(); for (int i=0; i<n; i++) { auto v = first[i]; size_t b = lobits(v, LOBITS); CHECK( b < nbuckets ) << "bucket id = " << b << ", nbuckets = " << nbuckets; // ff_delegate<bucket_t,uint64_t,ff_append>(bucketlist+b, v); auto destb = bucketlist+b; delegate::call<async>(destb.core(), [destb,v]{ destb.pointer()->append(v); }); } }); scatter_time = Grappa::walltime() - t; LOG(INFO) << "scatter_time: " << scatter_time; t = Grappa::walltime(); // sort buckets locally // forall_local<bucket_t,sort_bucket>(bucketlist, nbuckets); /// Do some kind of local serial sort of a bucket forall(bucketlist, nbuckets, [Scmp](int64_t bucket_id, bucket_t<S>& bucket){ if (bucket.size() == 0) return; qsort(&bucket[0], bucket.size(), sizeof(S), Scmp); }); local_sort_scatter_time = Grappa::walltime() - t; LOG(INFO) << "local_sort_time: " << local_sort_scatter_time; t = Grappa::walltime(); // redistribute buckets back into global array // forall_local<bucket_t,put_back_bucket>(bucketlist, nbuckets); /// Redistribute sorted buckets back into global array forall(bucketlist, nbuckets, [array,bucketlist](int64_t b, bucket_t<S>& bucket) { const size_t NBUF = BUFSIZE / sizeof(S); DCHECK( b < counts.size() ); // TODO: shouldn't need to buffer this, but a bug of some sort is currently forcing us to limit the number of outstanding messages //for_buffered(i, n, 0, bucket.size(), NBUF) { // typename Incoherent<S>::WO c(array+offsets[b]+i, n, &bucket[i]); // c.block_until_released(); // } if ( bucket.size() > 0 ) { typename 
Incoherent<S>::WO c(array+offsets[b], bucket.size(), &bucket[0]); // FIXME v is not in locale-shared-memory c.block_until_released(); } VLOG(3) << "bucket[" << b << "] release successful"; }); put_back_time = Grappa::walltime() - t; LOG(INFO) << "put_back_time: " << put_back_time; sort_time = Grappa::walltime() - sort_time; LOG(INFO) << "total_sort_time: " << sort_time; }
/// Deliver `val` to a FullEmpty cell that lives on another core.
///
/// Sends a heap message to the cell's core; the handler calls writeXF(val)
/// on the cell.
///
/// @param result_addr  global address of the destination FullEmpty<T>
/// @param val          value to write; copied into the message
void fill_remote(GlobalAddress<FullEmpty<T>> result_addr, const T& val) {
  auto dest_core = result_addr.core();

  send_heap_message(dest_core, [result_addr, val] {
    result_addr->writeXF(val);
  });
}
/// Overload of enroll() that works on a GlobalAddress.
///
/// Uses impl::call() targeted at the event's core; the callback invokes
/// enroll(incr) on the event.
///
/// @param ce    global address of the CompletionEvent
/// @param incr  amount to enroll (default 1)
inline void enroll(GlobalAddress<CompletionEvent> ce, int64_t incr = 1) {
  auto home_core = ce.core();

  impl::call(home_core, [ce, incr] {
    ce->enroll(incr);
  });
}
/// Publish a chunk descriptor in the global queue.
///
/// Two delegate round-trips:
///  1. ask HOME_NODE (push_reserve_g) for a queue entry; a reply whose
///     pointer is NULL means no space and the push fails;
///  2. send the chunk description to the core owning that entry
///     (push_entry_g).
/// Both requests are recorded in `global_queue_stats`.
///
/// @param chunk_base    global address of the first element of the chunk
/// @param chunk_amount  number of elements in the chunk
/// @return true if the entry was pushed, false if the queue had no space
bool GlobalQueue<T>::push( GlobalAddress<T> chunk_base, uint64_t chunk_amount ) {
  typedef GlobalAddress< QueueEntry<T> > EntryAddress;

  CHECK( initialized );
  DVLOG(5) << "push() base:" << chunk_base << " amount:" << chunk_amount;

  // Step 1: reserve an entry at the home node.
  EntryAddress slot = Grappa_delegate_func< bool, GlobalAddress< QueueEntry<T> >, GlobalQueue<T>::push_reserve_g > ( false, HOME_NODE );
  size_t reserve_bytes = Grappa_sizeof_delegate_func_request< bool, GlobalAddress< QueueEntry<T> > >( );
  DVLOG(5) << "push() reserve done -- loc:" << slot;

  const bool reserved = ( slot.pointer() != NULL );
  Grappa::Metrics::global_queue_stats.record_push_reserve_request( reserve_bytes, reserved );
  if ( !reserved ) {
    // no space in global queue; push failed
    return false;
  }

  // Step 2: push the queue entry that points to my chunk
  ChunkInfo<T> info;
  info.base = chunk_base;
  info.amount = chunk_amount;

  push_entry_args<T> request;
  request.target = slot;
  request.chunk = info;

  DVLOG(5) << "push() sending entry to " << slot;
  bool woke_sleeper = Grappa_delegate_func< push_entry_args<T>, bool, GlobalQueue<T>::push_entry_g > ( request, slot.core() );
  size_t entry_bytes = Grappa_sizeof_delegate_func_request< push_entry_args<T>, bool >( );
  Grappa::Metrics::global_queue_stats.record_push_entry_request( entry_bytes, woke_sleeper );

  return true;
}