Exemplo n.º 1
0
 /// Reads the FullEmpty<T> at `fe_addr` with `readFF()` semantics, whether the
 /// cell lives on this core or on another one.
 ///
 /// Local case: calls `readFF()` directly on the cell.
 /// Remote case: sends a message to the owning core. That core answers through
 /// `fill_remote()` into a FullEmpty that lives on this stack frame, either
 /// right away (cell already full) or later via a SuspendedDelegate registered
 /// with `add_waiter()`. The function then returns `readFF()` of that slot.
 T readFF(GlobalAddress<FullEmpty<T>> fe_addr) {
   const bool is_local = (fe_addr.core() == mycore());
   if (is_local) {
     DVLOG(2) << "local";
     return fe_addr.pointer()->readFF();
   }

   // Slot on this core that the owning core fills in with the answer.
   FullEmpty<T> reply;
   auto reply_addr = make_global(&reply);

   send_message(fe_addr.core(), [fe_addr,reply_addr]{
     auto& remote_fe = *fe_addr.pointer();

     if (!remote_fe.full()) {
       // Value not available yet: park a delegate on the cell that will
       // forward the value once it runs.
       DVLOG(2) << "setting up to block (" << fe_addr << ")";
       auto* waiter = SuspendedDelegate::create([&remote_fe,reply_addr]{
         VLOG(0) << "suspended_delegate!";
         fill_remote(reply_addr, remote_fe.readFF());
       });
       add_waiter(&remote_fe, waiter);
       return;
     }

     // Already full, no need to block: ship the value back immediately.
     fill_remote(reply_addr, remote_fe.readFF());
   });

   return reply.readFF();
 }
Exemplo n.º 2
0
 /// Adds `inc` to the value stored at `target`.
 ///
 /// The `+=` is executed on the core that owns `target`, issued through
 /// `delegate::call` in `SyncMode::Async` with completion event `C`.
 /// Also bumps the `delegate_async_increments` counter.
 void increment(GlobalAddress<T> target, U inc) {
   static_assert(std::is_convertible<T,U>(), "type of inc must match GlobalAddress type");
   delegate_async_increments++;
   const auto owner = target.core();
   delegate::call<SyncMode::Async,C>(owner, [target,inc]{
     auto& cell = *target.pointer();
     cell += inc;
   });
 }
Exemplo n.º 3
0
    /// Update-or-insert `key` in the hash table.
    ///
    /// The cell for `key` is chosen with `computeIndex(key)` and the work runs
    /// on the core that owns that cell via `Grappa::delegate::call`.
    ///  - If an entry with `key` already exists, its value becomes
    ///    `UpF(old_value, val)` and `hash_tables_size` is incremented.
    ///  - Otherwise a new entry with value `UpF(Init, val)` is appended to the
    ///    cell's list.
    ///
    /// @param key  key to update or insert
    /// @param val  update argument handed to `UpF`
    void update( K key, UV val ) {
      uint64_t index = computeIndex( key );
      GlobalAddress< Cell > target = base + index; 

      Grappa::delegate::call( target.core(), [key, val, target]() {   // TODO: upgrade to call_async; using GCE
        // list of entries in this cell
        std::list<DHT_TYPE(Entry)> * entries = target.pointer()->entries;

        // if first time the cell is hit then initialize
        if ( entries == NULL ) {
          entries = new std::list<Entry>();
          target.pointer()->entries = entries;
        }

        // find matching key in the list
        typename std::list<DHT_TYPE(Entry)>::iterator i;
        for (i = entries->begin(); i!=entries->end(); ++i) {
          if ( i->key == key ) {
            // key found so update
            i->value = UpF(i->value, val);
            // NOTE(review): the counter is bumped on the update path only,
            // not on first insert -- confirm this is the intended metric.
            hash_tables_size+=1;
            return 0;
          }
        }

        // this is the first time the key has been seen
        // so add it to the list
        Entry newe( key, UpF(Init, val));
        // Previously the entry was constructed and then dropped, so first-seen
        // keys never made it into the table.
        entries->push_back( newe );

        return 0; 
      });
    }
Exemplo n.º 4
0
 /// Returns a copy of the value stored at `target`.
 ///
 /// The dereference happens inside a `call<S,C>` delegate executed on the core
 /// that owns `target`. `delegate_reads` is incremented by the caller and
 /// `delegate_read_targets` inside the delegate.
 T read(GlobalAddress<T> target) {
   delegate_reads++;
   const auto owner = target.core();
   return call<S,C>(owner, [target]() -> T {
     delegate_read_targets++;
     T* src = target.pointer();
     return *src;
   });
 }
Exemplo n.º 5
0
 /// Acquires the mutex named by global address `m`.
 ///
 /// Only mutexes that live on the calling core are supported: those are
 /// forwarded to the pointer overload of `lock()`. A remote mutex trips
 /// `CHECK(false)`; acquiring it by spawning a task on the home node is
 /// not implemented here.
 inline void lock( GlobalAddress<Mutex> m ) {
   const bool is_local = ( m.core() == mycore() );
   if( !is_local ) {
     // remote acquire is not implemented
     CHECK(false);
     return;
   }
   // local: just acquire
   lock( m.pointer() );
 }
Exemplo n.º 6
0
 /// Overload of complete() that accepts a GlobalAddress.
 ///
 /// If the CompletionEvent is on the calling core, `complete(decr)` is invoked
 /// directly. Otherwise `ce_remote_completions` is increased by `decr` and a heap
 /// message is sent to the owning core to perform the completion there.
 // template<typename CompletionType>
 // inline void complete(GlobalAddress<CompletionType> ce, int64_t decr = 1) {
   // static_assert(std::is_base_of<CompletionEvent,CompletionType>::value,
   //               "complete() can only be called on subclasses of CompletionEvent");
 inline void complete(GlobalAddress<CompletionEvent> ce, int64_t decr = 1) {
   DVLOG(5) << "complete CompletionEvent";

   if (ce.core() == mycore()) {
     ce.pointer()->complete(decr);
     return;
   }

   ce_remote_completions += decr;

   if (decr != 1) {
     // general case: the decrement amount travels with the message
     send_heap_message(ce.core(), [ce,decr] {
       ce.pointer()->complete(decr);
     });
     return;
   }

   // (common case) don't send full 8 bytes just to decrement by 1
   send_heap_message(ce.core(), [ce] {
     ce.pointer()->complete();
   });
 }
Exemplo n.º 7
0
 /// Stores `value` into the location named by `target`.
 ///
 /// The assignment is performed by a `call<S,C>` delegate on the core that owns
 /// `target`. `delegate_writes` is incremented by the caller and
 /// `delegate_write_targets` inside the delegate.
 void write(GlobalAddress<T> target, U value) {
   static_assert(std::is_convertible<T,U>(), "type of value must match GlobalAddress type");
   delegate_writes++;
   const auto owner = target.core();
   // TODO: don't return any val, requires changes to `delegate::call()`.
   return call<S,C>(owner, [target, value] {
     delegate_write_targets++;
     T* dest = target.pointer();
     *dest = value;
   });
 }
Exemplo n.º 8
0
 /// Adds `inc` to the value at `target` and returns the value it held before
 /// the addition.
 ///
 /// Both the read and the `+=` happen inside a single `call` delegate on the
 /// core that owns `target`. `delegate_fetchadds` is incremented by the caller
 /// and `delegate_fetchadd_targets` inside the delegate.
 T fetch_and_add(GlobalAddress<T> target, U inc) {
   delegate_fetchadds++;
   return call(target.core(), [target, inc]() -> T {
     delegate_fetchadd_targets++;
     T& slot = *target.pointer();
     T previous = slot;
     slot += inc;
     return previous;
   });
 }
Exemplo n.º 9
0
  /// Puts the acquirer back into its initial state and recomputes how many
  /// messages (`num_messages_`) the current `*request_address_` / `*count_`
  /// request needs.
  ///
  /// Aborts (CHECK) if an acquire has been started but has not completed.
  /// Two cases are finished immediately (both flags set to true):
  ///  - `*count_ == 0`: `*pointer_` is set to NULL;
  ///  - the address is 2D and on this core: `*pointer_` is set to the local pointer.
  void reset( ) {
    DVLOG(5) << "In " << __PRETTY_FUNCTION__;
    // Resetting in the middle of an outstanding acquire is not allowed.
    CHECK( !acquire_started_ || acquired_ ) << "inconsistent state for reset";
    acquire_started_ = false;
    acquired_ = false;
    thread_ = NULL;
    num_messages_ = 0;
    response_count_ = 0;
    // Total bytes expected back: one T per requested element.
    expected_reply_payload_ = sizeof( T ) * *count_;
    total_reply_payload_ = 0;
    start_time_ = 0;
    network_time_ = 0;
    if( *count_ == 0 ) {
      // Nothing to fetch: mark the acquire as already done.
      DVLOG(5) << "Zero-length acquire";
      *pointer_ = NULL;
      acquire_started_ = true;
      acquired_ = true;
    } else if( request_address_->is_2D() ) {
      // 2D address: a single message is used.
      // NOTE(review): presumably a 2D address names memory on exactly one core -- confirm.
      num_messages_ = 1;
      if( request_address_->core() == Grappa::mycore() ) {
        // Data is on this core: hand out the local pointer, no message needed.
        DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
        *pointer_ = request_address_->pointer();
        acquire_started_ = true;
        acquired_ = true;
      }
    } else {
      // Non-2D address: the request may span several blocks.
      DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ;
      DVLOG(5) << ", request_address is " << *request_address_;
      DVLOG(5) << ", sizeof(T) is " << sizeof(T);
      DVLOG(5) << ", count is " << *count_;
      DVLOG(5) << ", block_min is " << request_address_->block_min();


      DVLOG(5) << "Straddle: address is " << *request_address_ ;
      DVLOG(5) << ", address + count is " << *request_address_ + *count_;

      // Byte distance from the start of the block holding the first requested
      // byte to the block_max of the block holding the last requested byte.
      ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - 
                               request_address_->first_byte().block_min() );

      DVLOG(5) << "Straddle: address block max is " << request_address_->block_max();
      DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max();
      DVLOG(5) << " address block min " << request_address_->block_min();
      DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size;
      // One message per block covered by that span.
      num_messages_ = byte_diff / block_size;
    }

    if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************";

    DVLOG(5) << "In " << __PRETTY_FUNCTION__ << "; detecting straddle for sizeof(T):" << sizeof(T)
             << " count:" << *count_
             << " num_messages_:" << num_messages_
             << " request_address:" << *request_address_;
  }
Exemplo n.º 10
0
 /// Builds an acquirer that performs no communication.
 ///
 /// It only fills in `*pointer` for the cases that can be answered locally:
 ///  - a zero-length request (`*count == 0`, or a null `count`): `*pointer` = NULL;
 ///  - a 2D address that lives on the calling core: `*pointer` = the local pointer.
 /// In every other case `*pointer` is left untouched.
 ///
 /// @param request_address  global address being requested (stored, not copied)
 /// @param count            pointer to the number of elements requested
 /// @param pointer          out-parameter receiving the local pointer
 NullAcquirer(GlobalAddress<T> * request_address, size_t * count, T** pointer)
 : request_address_(request_address), count_(count), pointer_(pointer)
 {
   VLOG(6) << "pointer = " << pointer << ", pointer_ = " << pointer_;
   // The element count must be tested, not the pointer to it. The earlier
   // `count == 0` only fired for a null pointer, so genuine zero-length
   // requests skipped this branch. A null `count` is still treated as
   // zero-length to stay compatible with that earlier behaviour.
   if( count_ == NULL || *count_ == 0 ) {
     DVLOG(5) << "Zero-length acquire";
     *pointer_ = NULL;
   } else if( request_address_->is_2D() && request_address_->core() == Grappa::mycore() ) {
     DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
     *pointer_ = request_address_->pointer();
   }
 }
Exemplo n.º 11
0
    /// Looks up `key` and, if it is present, invokes `f` with the stored value.
    ///
    /// The lookup runs in a task spawned with `Grappa::spawnRemote<GCE>` on the
    /// core that owns the key's cell; `f` is called there, and only on a hit.
    void lookup ( K key, CF f ) {
      const uint64_t cell_index = computeIndex( key );
      GlobalAddress< Cell > target = base + cell_index;

      // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
      //TODO optimization where only need to do remotePrivateTask instead of call_async
      //if you are going to do more suspending ops (comms) inside the loop
      Grappa::spawnRemote<GCE>( target.core(), [key, target, f, this]() {
        Entry found;
        const bool hit = lookup_local( key, target.pointer(), &found );
        if (hit) {
          f(found.value);
        }
      });
    }
Exemplo n.º 12
0
  /// Puts the releaser back into its initial state and recomputes how many
  /// messages (`num_messages_`) the current `*request_address_` / `*count_`
  /// release needs.
  ///
  /// Aborts (CHECK) if a release has been started but has not completed.
  /// A zero-length release, and a 2D address on this core, are marked as
  /// already released.
  void reset( ) {
    // Resetting in the middle of an outstanding release is not allowed.
    CHECK( !release_started_ || released_ ) << "inconsistent state for reset";
    release_started_ = false;
    released_ = false;
    thread_ = NULL;
    num_messages_ = 0;
    response_count_ = 0;
    if( *count_ == 0 ) {
      // Nothing to write back: mark the release as already done.
      DVLOG(5) << "Zero-length release";
      release_started_ = true;
      released_ = true;
    } else if( request_address_->is_2D() ) {
      // 2D address: a single message is used.
      // NOTE(review): presumably a 2D address names memory on exactly one core -- confirm.
      num_messages_ = 1;
      if( request_address_->core() == Grappa::mycore() ) {
        // Target is on this core: no message needed.
        release_started_ = true;
        released_ = true;
      }
    } else {
      // Non-2D address: the release may span several blocks.
      DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ;
      DVLOG(5) << ", request_address is " << *request_address_;
      DVLOG(5) << ", sizeof(T) is " << sizeof(T);
      DVLOG(5) << ", count is " << *count_;
      DVLOG(5) << ", block_min is " << request_address_->block_min();


      DVLOG(5) << "Straddle: address is " << *request_address_ ;
      DVLOG(5) << ", address + count is " << *request_address_ + *count_;

      // Byte distance from the start of the block holding the first byte to the
      // block_max of the block holding the last byte of the range.
      ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - 
                               request_address_->first_byte().block_min() );

      DVLOG(5) << "Straddle: address block max is " << request_address_->block_max();
      DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max();
      DVLOG(5) << " address + count -1 block max is " << (*request_address_ + *count_ - 1).block_max();
      DVLOG(5) << " difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() );
      DVLOG(5) << " multiplied difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() ) * sizeof(T);
      DVLOG(5) << " address block min " << request_address_->block_min();
      DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size;
      // One message per block covered by that span.
      num_messages_ = byte_diff / block_size;
    }

    if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************";

    DVLOG(5) << "Detecting straddle for sizeof(T):" << sizeof(T)
             << " count:" << *count_
             << " num_messages_:" << num_messages_
             << " request_address:" << *request_address_;
  }
Exemplo n.º 13
0
 /// Compare-and-swap on the value at `target`.
 ///
 /// Inside a single `call` delegate on the owning core: if the stored value
 /// equals `cmp_val`, it is replaced by `new_val`.
 ///
 /// @return true if the swap happened, false if the stored value differed
 bool compare_and_swap(GlobalAddress<T> target, U cmp_val, V new_val) {
   static_assert(std::is_convertible<T,U>(), "type of cmp_val must match GlobalAddress type");
   static_assert(std::is_convertible<T,V>(), "type of new_val must match GlobalAddress type");

   delegate_cmpswaps++;
   return call(target.core(), [target, cmp_val, new_val]() -> bool {
     T * slot = target.pointer();
     delegate_cmpswap_targets++;
     // mismatch: leave the stored value alone
     if (!(cmp_val == *slot)) {
       return false;
     }
     *slot = new_val;
     return true;
   });
 }
Exemplo n.º 14
0
        /// Combining fetch-and-add: contributes `inc` to a shared pending
        /// `increment`, and returns this caller's starting value within the
        /// combined result.
        ///
        /// One participant (the one that observes the flush condition below)
        /// issues a single delegate `call` that adds the accumulated total to
        /// `target`; the others wait on `untilReceived`. Afterwards each
        /// participant takes the current `result` as its own return value and
        /// advances `result` by its `inc`.
        T fetch_and_add( U inc ) {

          // NOTE(review): presumably suspends until the combiner is accepting
          // new participants -- confirm against block_until_ready().
          block_until_ready();

          // fetch add unit is now aggregating so add my inc

          participant_count++;
          committed--;
          increment += inc;
        
          // if I'm the last entered client and either the flush threshold
          // is reached or there are no more committed participants then start the flush 
          if ( ready_waiters == 0 && (participant_count >= flush_threshold || committed == 0 )) {
            set_not_ready();
            // Snapshot the total so the delegate lambda captures a plain value.
            uint64_t increment_total = increment;
            flat_combiner_fetch_and_add_amount += increment_total;
            auto t = target;
            // NOTE(review): the lambda is declared to return U and reads the old
            // value into a uint64_t, while the function returns T -- confirm
            // these types always agree for the instantiations in use.
            result = call(target.core(), [t, increment_total]() -> U {
              T * p = t.pointer();
              uint64_t r = *p;
              *p += increment_total;
              return r;
            });
            // tell the others that the result has arrived
            Grappa::broadcast(&untilReceived);
          } else {
            // someone else will start the flush
            Grappa::wait(&untilReceived);
          }

          // Claim my slice: my start is the current running result, and the
          // next participant starts after my increment.
          uint64_t my_start = result;
          result += inc;
          participant_count--;
          increment -= inc;   // for validation purposes (could just set to 0)
          if ( participant_count == 0 ) {
            // Last participant out re-opens the combiner.
            CHECK( increment == 0 ) << "increment = " << increment << " even though all participants are done";
            set_ready();
          }

          return my_start;
        }
Exemplo n.º 15
0
    /// Blocking lookup of `key`.
    ///
    /// Runs `lookup_local` on the core that owns the key's cell via
    /// `Grappa::delegate::call` and copies the outcome back.
    ///
    /// @param key  key to search for
    /// @param val  out-parameter; receives the stored value on a hit. On a miss
    ///             it is still assigned from the (unset) result field, so callers
    ///             should only use it when the function returns true.
    /// @return true if `key` was found, false otherwise
    bool lookup ( K key, V * val ) {          
      uint64_t index = computeIndex( key );
      GlobalAddress< Cell > target = base + index; 

      // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
      lookup_result result = Grappa::delegate::call( target.core(), [key,target,this]() {

        DHT_TYPE(lookup_result) lr;
        // Default to "not found": without this, a miss returned whatever
        // happened to be in the uninitialised flag.
        lr.valid = false;

        Entry e;
        if (lookup_local( key, target.pointer(), &e)) {
          lr.valid = true;
          lr.result = e.value;
        }

        return lr;
      });

      *val = result.result;
      return result.valid;
    } 
Exemplo n.º 16
0
/// Distributed bucket sort of `nelems` elements of the global array `array`.
///
/// Phases (each one timed and logged at INFO level):
///  1. histogram: every element is assigned a bucket with `lobits` and the
///     per-core `counts` are incremented;
///  2. allreduce: `counts` are summed across cores and each core computes the
///     prefix sum into `offsets`;
///  3. scatter: every element is appended to its bucket with an async delegate;
///  4. local sort: each non-empty bucket is sorted with `qsort` using `Scmp`;
///  5. put back: each non-empty bucket is written to `array + offsets[b]`.
///
/// @param array        global array to sort in place
/// @param nelems       number of elements in `array`
/// @param Scmp         qsort-style comparison function for elements of type S
/// @param lobits       maps (element, shift) to a bucket index; called with
///                     shift = log2maxkey - log2buckets
/// @param log2buckets  log2 of the number of buckets
/// @param log2maxkey   log2 of the maximum key; only used to derive the shift
void bucket_sort(GlobalAddress<S> array, size_t nelems, int (*Scmp)(const void*,const void*), size_t (*lobits)(S,int), int log2buckets, int log2maxkey ) {
  double t, sort_time, histogram_time, allreduce_time, scatter_time, local_sort_scatter_time, put_back_time;

  int LOBITS = log2maxkey - log2buckets;
  // Shift in size_t: `1 << log2buckets` is an int shift and overflows for
  // log2buckets >= 31 before it is widened.
  size_t nbuckets = static_cast<size_t>(1) << log2buckets;

  GlobalAddress<bucket_t<S> > bucketlist = Grappa::global_alloc<bucket_t<S> >(nbuckets);

#ifdef DEBUG
  for (size_t i=0; i<nbuckets; i++) {
    GlobalAddress<bucket_t<S> > bi = bucketlist+i;
    VLOG(1) << "bucket[" << i << "] on Node " << bi.core() << ", offset = " << bi.pointer() - bucketlist.localize(bi.core()) << ", ptr = " << bi.pointer();
  }
#endif

  sort_time = Grappa::walltime();

  // initialize globals and histogram counts
  // { setup_counts f(array, nbuckets, bucketlist); fork_join_custom(&f); }
  on_all_cores([nbuckets]{
    counts.resize(nbuckets);
    offsets.resize(nbuckets);
    for (size_t i=0; i<nbuckets; i++) {
      counts[i] = 0;
    }
  });

  t = Grappa::walltime();

  // do local bucket counts
  // forall_local<uint64_t,histogram>(array, nelems);
  forall(array, nelems, [lobits,LOBITS](int64_t i, S& v) {
    size_t b = lobits(v, LOBITS); // TODO decide how to compare general in pieces
    counts[b]++;
  });

  histogram_time = Grappa::walltime() - t;
  LOG(INFO) << "histogram_time: " << histogram_time;
  t = Grappa::walltime();

  // allreduce everyone's counts & compute global offsets (prefix sum)
  // { aggregate_counts f; fork_join_custom(&f); }
  on_all_cores([nbuckets] {
    CHECK_EQ(nbuckets, counts.size());
    // all nodes get total counts put into their counts array
    allreduce_inplace<size_t,collective_add>(&counts[0], counts.size());

    VLOG(1) << "after allreduce_inplace (just in case)";

    // everyone computes prefix sum over buckets locally
    offsets[0] = 0;
    for (size_t i=1; i<offsets.size(); i++) {
      offsets[i] = offsets[i-1] + counts[i-1];
    }
  });
  
  allreduce_time = Grappa::walltime() - t;
  LOG(INFO) << "allreduce_time: " << allreduce_time;

  // allocate space in buckets
  VLOG(3) << "allocating space...";
  // forall_local<bucket_t,init_buckets>(bucketlist, nbuckets);
  forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){
    // (global malloc doesn't call constructors)
    new (&bucket) bucket_t<S>();
    bucket.reserve(counts[id]);
  });
  
  VLOG(3) << "scattering...";
  t = Grappa::walltime();

  // scatter into buckets
  // forall_local<uint64_t,scatter>(array, nelems);
  forall(array, nelems, [bucketlist,lobits,LOBITS](int64_t s, int64_t n, S * first){
    size_t nbuckets = counts.size();
    
    // Index with the same 64-bit type as `n`; an `int` index overflows when a
    // local chunk holds more than INT_MAX elements.
    for (int64_t i=0; i<n; i++) {
      auto v = first[i];
      size_t b = lobits(v, LOBITS);
      CHECK( b < nbuckets ) << "bucket id = " << b << ", nbuckets = " << nbuckets;
      // ff_delegate<bucket_t,uint64_t,ff_append>(bucketlist+b, v);
      auto destb = bucketlist+b;
      delegate::call<async>(destb.core(), [destb,v]{
        destb.pointer()->append(v);
      });
    }
  });
    
  scatter_time = Grappa::walltime() - t;
  LOG(INFO) << "scatter_time: " << scatter_time;
  t = Grappa::walltime();

  // sort buckets locally
  // forall_local<bucket_t,sort_bucket>(bucketlist, nbuckets);
  /// Do some kind of local serial sort of a bucket
  forall(bucketlist, nbuckets, [Scmp](int64_t bucket_id, bucket_t<S>& bucket){
    if (bucket.size() == 0) return;
    qsort(&bucket[0], bucket.size(), sizeof(S), Scmp);
  });

  local_sort_scatter_time = Grappa::walltime() - t;
  LOG(INFO) << "local_sort_time: " << local_sort_scatter_time;  
  t = Grappa::walltime(); 
  
  // redistribute buckets back into global array  
  // forall_local<bucket_t,put_back_bucket>(bucketlist, nbuckets);
  /// Redistribute sorted buckets back into global array
  forall(bucketlist, nbuckets, [array,bucketlist](int64_t b, bucket_t<S>& bucket) {
    const size_t NBUF = BUFSIZE / sizeof(S);
    DCHECK( b < counts.size() );

    // TODO: shouldn't need to buffer this, but a bug of some sort is currently forcing us to limit the number of outstanding messages    
    //for_buffered(i, n, 0, bucket.size(), NBUF) {
    //  typename Incoherent<S>::WO c(array+offsets[b]+i, n, &bucket[i]);
    //  c.block_until_released();
    // }
    
    if ( bucket.size() > 0 ) {
      typename Incoherent<S>::WO c(array+offsets[b], bucket.size(), &bucket[0]); // FIXME v is not in locale-shared-memory
      c.block_until_released();
    }

    VLOG(3) << "bucket[" << b << "] release successful";
  });
  
  put_back_time = Grappa::walltime() - t;
  LOG(INFO) << "put_back_time: " << put_back_time;
  
  sort_time = Grappa::walltime() - sort_time;
  LOG(INFO) << "total_sort_time: " << sort_time;
}
Exemplo n.º 17
0
 /// Delivers `val` to the FullEmpty<T> named by `result_addr`.
 ///
 /// A heap message carrying a copy of `val` is sent to the core that owns
 /// `result_addr`; on arrival it calls `writeXF(val)` on that FullEmpty.
 void fill_remote(GlobalAddress<FullEmpty<T>> result_addr, const T& val) {
   const auto dest_core = result_addr.core();
   send_heap_message(dest_core, [result_addr,val]{
     result_addr->writeXF(val);
   });
 }
Exemplo n.º 18
0
 /// Overload of enroll() that accepts a GlobalAddress: runs `enroll(incr)` on
 /// the CompletionEvent via `impl::call` on the core that owns it.
 inline void enroll(GlobalAddress<CompletionEvent> ce, int64_t incr = 1) {
   const auto owner = ce.core();
   impl::call(owner, [ce,incr]{
     ce->enroll(incr);
   });
 }
Exemplo n.º 19
0
/// Publishes a chunk description (`chunk_base`, `chunk_amount`) to the global queue.
///
/// Two delegate calls are made:
///  1. `push_reserve_g` on HOME_NODE to obtain the address of a queue entry;
///     a NULL pointer in the returned address means there is no space.
///  2. `push_entry_g` on the core owning that entry to store the chunk info.
/// Both requests are recorded in `global_queue_stats`.
///
/// @return false if no entry could be reserved, true once the entry was sent
bool GlobalQueue<T>::push( GlobalAddress<T> chunk_base, uint64_t chunk_amount ) {
  CHECK( initialized );
  DVLOG(5) << "push() base:" << chunk_base << " amount:" << chunk_amount;

  // reserve a slot in the queue
  GlobalAddress< QueueEntry<T> > slot = Grappa_delegate_func< bool, GlobalAddress< QueueEntry<T> >, GlobalQueue<T>::push_reserve_g > ( false, HOME_NODE );
  size_t reserve_bytes = Grappa_sizeof_delegate_func_request< bool, GlobalAddress< QueueEntry<T> > >( );

  DVLOG(5) << "push() reserve done -- loc:" << slot;

  // record the reservation outcome either way; bail out if the queue is full
  const bool reserved = ( slot.pointer() != NULL );
  Grappa::Metrics::global_queue_stats.record_push_reserve_request( reserve_bytes, reserved );
  if ( !reserved ) {
    // no space in global queue; push failed
    return false;
  }

  // push the queue entry that points to my chunk
  ChunkInfo<T> info;
  info.base = chunk_base;
  info.amount = chunk_amount;

  push_entry_args<T> request;
  request.target = slot;
  request.chunk = info;

  DVLOG(5) << "push() sending entry to " << slot;
  bool woke_sleeper = Grappa_delegate_func< push_entry_args<T>, bool, GlobalQueue<T>::push_entry_g > ( request, slot.core() );
  size_t entry_bytes = Grappa_sizeof_delegate_func_request< push_entry_args<T>, bool >( );
  Grappa::Metrics::global_queue_stats.record_push_entry_request( entry_bytes, woke_sleeper );

  return true;
}