Beispiel #1
0
 T read(GlobalAddress<T> target) {
   delegate_reads++;
   return call<S,C>(target.core(), [target]() -> T {
     delegate_read_targets++;
     return *target.pointer();
   });
 }
Beispiel #2
0
bool GlobalQueue<T>::push( GlobalAddress<T> chunk_base, uint64_t chunk_amount ) {
  CHECK( initialized );
  DVLOG(5) << "push() base:" << chunk_base << " amount:" << chunk_amount;

  GlobalAddress< QueueEntry<T> > loc = Grappa_delegate_func< bool, GlobalAddress< QueueEntry<T> >, GlobalQueue<T>::push_reserve_g > ( false, HOME_NODE );
  size_t msg_bytes = Grappa_sizeof_delegate_func_request< bool, GlobalAddress< QueueEntry<T> > >( );

  DVLOG(5) << "push() reserve done -- loc:" << loc;

  if ( loc.pointer() == NULL ) {
    Grappa::Metrics::global_queue_stats.record_push_reserve_request( msg_bytes, false );
    // no space in global queue; push failed
    return false;
  }

  Grappa::Metrics::global_queue_stats.record_push_reserve_request( msg_bytes, true );

  // push the queue entry that points to my chunk 
  ChunkInfo<T> c;
  c.base = chunk_base;
  c.amount = chunk_amount;
  push_entry_args<T> entry_args;
  entry_args.target = loc;
  entry_args.chunk = c;
  DVLOG(5) << "push() sending entry to " << loc;
  bool had_sleeper = Grappa_delegate_func< push_entry_args<T>, bool, GlobalQueue<T>::push_entry_g > ( entry_args, loc.core() ); 
  size_t entry_msg_bytes = Grappa_sizeof_delegate_func_request< push_entry_args<T>, bool >( );
  Grappa::Metrics::global_queue_stats.record_push_entry_request( entry_msg_bytes, had_sleeper );

  return true;
}
Beispiel #3
0
    void update( K key, UV val ) {
      uint64_t index = computeIndex( key );
      GlobalAddress< Cell > target = base + index; 

      Grappa::delegate::call( target.core(), [key, val, target]() {   // TODO: upgrade to call_async; using GCE
        // list of entries in this cell
        std::list<DHT_TYPE(Entry)> * entries = target.pointer()->entries;

        // if first time the cell is hit then initialize
        if ( entries == NULL ) {
          entries = new std::list<Entry>();
          target.pointer()->entries = entries;
        }

        // find matching key in the list
        typename std::list<DHT_TYPE(Entry)>::iterator i;
        for (i = entries->begin(); i!=entries->end(); ++i) {
          if ( i->key == key ) {
            // key found so update
            i->value = UpF(i->value, val);
            hash_tables_size+=1;
            return 0;
          }
        }

        // this is the first time the key has been seen
        // so add it to the list
        Entry newe( key, UpF(Init, val));

        return 0; 
      });
    }
Beispiel #4
0
 T readFF(GlobalAddress<FullEmpty<T>> fe_addr) {
   if (fe_addr.core() == mycore()) {
     DVLOG(2) << "local";
     return fe_addr.pointer()->readFF();
   }
   
   FullEmpty<T> result;
   auto result_addr = make_global(&result);
   
   send_message(fe_addr.core(), [fe_addr,result_addr]{
     auto& fe = *fe_addr.pointer();
     
     if (fe.full()) {
       // DVLOG(2) << "no need to block";
       fill_remote(result_addr, fe.readFF());
       return;
     }
     
     DVLOG(2) << "setting up to block (" << fe_addr << ")";
     auto* c = SuspendedDelegate::create([&fe,result_addr]{
       VLOG(0) << "suspended_delegate!";
       fill_remote(result_addr, fe.readFF());
     });
     add_waiter(&fe, c);
   });
   
   return result.readFF();
 }
Beispiel #5
0
 void increment(GlobalAddress<T> target, U inc) {
   static_assert(std::is_convertible<T,U>(), "type of inc must match GlobalAddress type");
   delegate_async_increments++;
   delegate::call<SyncMode::Async,C>(target.core(), [target,inc]{
     (*target.pointer()) += inc;
   });
 }
Beispiel #6
0
 inline void lock( GlobalAddress<Mutex> m ) {
   // if local, just acquire
   if( m.core() == mycore() ) {
     lock( m.pointer() );
   } else { // if remote, spawn a task on the home node to acquire 
     CHECK(false);
   }
 }
Beispiel #7
0
 void write(GlobalAddress<T> target, U value) {
   static_assert(std::is_convertible<T,U>(), "type of value must match GlobalAddress type");
   delegate_writes++;
   // TODO: don't return any val, requires changes to `delegate::call()`.
   return call<S,C>(target.core(), [target, value] {
     delegate_write_targets++;
     *target.pointer() = value;
   });
 }
Beispiel #8
0
 T fetch_and_add(GlobalAddress<T> target, U inc) {
   delegate_fetchadds++;
   return call(target.core(), [target, inc]() -> T {
     delegate_fetchadd_targets++;
     T* p = target.pointer();
     T r = *p;
     *p += inc;
     return r;
   });
 }
Beispiel #9
0
 /// delegate malloc
 static GlobalAddress< void > remote_malloc( size_t size_bytes ) {
   // ask node 0 to allocate memory
   auto allocated_address = Grappa::impl::call( 0, [size_bytes] {
       DVLOG(5) << "got malloc request for size " << size_bytes;
       GlobalAddress< void > a = global_allocator->local_malloc( size_bytes );
       DVLOG(5) << "malloc returning pointer " << a.pointer();
       return a;
     });
   return allocated_address;
 }
Beispiel #10
0
 NullAcquirer(GlobalAddress<T> * request_address, size_t * count, T** pointer)
 : request_address_(request_address), count_(count), pointer_(pointer)
 {
   VLOG(6) << "pointer = " << pointer << ", pointer_ = " << pointer_;
   if( count == 0 ) {
     DVLOG(5) << "Zero-length acquire";
     *pointer_ = NULL;
   } else if( request_address_->is_2D() && request_address_->core() == Grappa::mycore() ) {
     DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
     *pointer_ = request_address_->pointer();
   }
 }
Beispiel #11
0
    void lookup ( K key, CF f ) {
      uint64_t index = computeIndex( key );
      GlobalAddress< Cell > target = base + index; 

      // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
      //TODO optimization where only need to do remotePrivateTask instead of call_async
      //if you are going to do more suspending ops (comms) inside the loop
      Grappa::spawnRemote<GCE>( target.core(), [key, target, f, this]() {
        Entry e;
        if (lookup_local( key, target.pointer(), &e)) {
          f(e.value);
        }
      });
    }
Beispiel #12
0
 bool compare_and_swap(GlobalAddress<T> target, U cmp_val, V new_val) {
   static_assert(std::is_convertible<T,U>(), "type of cmp_val must match GlobalAddress type");
   static_assert(std::is_convertible<T,V>(), "type of new_val must match GlobalAddress type");
   
   delegate_cmpswaps++;
   return call(target.core(), [target, cmp_val, new_val]() -> bool {
     T * p = target.pointer();
     delegate_cmpswap_targets++;
     if (cmp_val == *p) {
       *p = new_val;
       return true;
     } else {
       return false;
     }
   });
 }
Beispiel #13
0
double perf_test(GlobalAddress<GlobalVector<int64_t>> qa) {
  double t = Grappa::walltime();
  
  forall(0, FLAGS_nelems, [qa](int64_t i){
    if (EXP == Exp::QUEUE) {
      
      if (choose_random(FLAGS_fraction_push)) {
        qa->push(next_random<int64_t>());
      } else {
        qa->dequeue();
      }
      
    } else if (EXP == Exp::STACK) {
      
      if (choose_random(FLAGS_fraction_push)) {
        qa->push(next_random<int64_t>());
      } else {
        // qa->pop();
        CHECK_GT(qa->pop(), -1);
      }
      
    } else if (EXP == Exp::PUSH) {
      qa->push(next_random<int64_t>());
    } else if (EXP == Exp::POP) {
      qa->pop();
    } else if (EXP == Exp::DEQUEUE) {
      qa->dequeue();
    }
  });
  
  t = Grappa::walltime() - t;
  return t;
}
Beispiel #14
0
 void push(const T& o) {
     CHECK(target_array.pointer() != NULL) << "buffer not initialized!";
     buf[curr_buf][curr_size[curr_buf]] = o;
     curr_size[curr_buf]++;
     CHECK(curr_size[curr_buf] <= BUFSIZE);
     if (curr_size[curr_buf] == BUFSIZE) {
         flush();
     }
 }
  void reset( ) {
    DVLOG(5) << "In " << __PRETTY_FUNCTION__;
    CHECK( !acquire_started_ || acquired_ ) << "inconsistent state for reset";
    acquire_started_ = false;
    acquired_ = false;
    thread_ = NULL;
    num_messages_ = 0;
    response_count_ = 0;
    expected_reply_payload_ = sizeof( T ) * *count_;
    total_reply_payload_ = 0;
    start_time_ = 0;
    network_time_ = 0;
    if( *count_ == 0 ) {
      DVLOG(5) << "Zero-length acquire";
      *pointer_ = NULL;
      acquire_started_ = true;
      acquired_ = true;
    } else if( request_address_->is_2D() ) {
      num_messages_ = 1;
      if( request_address_->core() == Grappa::mycore() ) {
        DVLOG(5) << "Short-circuiting to address " << request_address_->pointer();
        *pointer_ = request_address_->pointer();
        acquire_started_ = true;
        acquired_ = true;
      }
    } else {
      DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ;
      DVLOG(5) << ", request_address is " << *request_address_;
      DVLOG(5) << ", sizeof(T) is " << sizeof(T);
      DVLOG(5) << ", count is " << *count_;
      DVLOG(5) << ", block_min is " << request_address_->block_min();


      DVLOG(5) << "Straddle: address is " << *request_address_ ;
      DVLOG(5) << ", address + count is " << *request_address_ + *count_;

      ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - 
                               request_address_->first_byte().block_min() );

      DVLOG(5) << "Straddle: address block max is " << request_address_->block_max();
      DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max();
      DVLOG(5) << " address block min " << request_address_->block_min();
      DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size;
      num_messages_ = byte_diff / block_size;
    }

    if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************";

    DVLOG(5) << "In " << __PRETTY_FUNCTION__ << "; detecting straddle for sizeof(T):" << sizeof(T)
             << " count:" << *count_
             << " num_messages_:" << num_messages_
             << " request_address:" << *request_address_;
  }
Beispiel #16
0
   T reduce( GlobalAddress<T> localizable ) {
     CompletionEvent ce(cores() - 1); 

     T total = *(localizable.localize());
     Core origin = mycore();

     for (Core c=0; c<cores(); c++) {
      if (c != origin) {
        send_heap_message(c, [localizable, &ce, &total, origin]{
          T val = *(localizable.localize());
          send_heap_message(origin, [val,&ce,&total] {
            total = ReduceOp(total, val);
            ce.complete();
          });
        });
      }
     }
    ce.wait();
    return total;
   }
Beispiel #17
0
 static double get_edge_weight(GlobalAddress<G> g, int64_t i, int64_t j) {
   return delegate::call(g->vs+i, [=](Vertex& v){ 
       for (int k = 0; v.nadj; k++) {
         auto e = g->edge(v,k);
         if (e.id == j)
           return e->weight;
       }
       // we can not reach this, possibly better to throw exception
       return 0.0;
   });
 }
Beispiel #18
0
    bool lookup ( K key, V * val ) {          
      uint64_t index = computeIndex( key );
      GlobalAddress< Cell > target = base + index; 

      // FIXME: remove 'this' capture when using gcc4.8, this is just a bug in 4.7
      lookup_result result = Grappa::delegate::call( target.core(), [key,target,this]() {

        DHT_TYPE(lookup_result) lr;

        Entry e;
        if (lookup_local( key, target.pointer(), &e)) {
          lr.valid = true;
          lr.result = e.value;
        }

        return lr;
      });

      *val = result.result;
      return result.valid;
    } 
Beispiel #19
0
  void do_release() {
    size_t total_bytes = *count_ * sizeof(T);
    
    RequestArgs args;
    args.request_address = *request_address_;
    DVLOG(5) << "Computing request_bytes from block_max " << request_address_->first_byte().block_max() << " and " << *request_address_;
    args.reply_address = make_global( this );
    size_t offset = 0;
    size_t request_bytes = 0;

    for( size_t i = 0;
         offset < total_bytes; 
         offset += request_bytes, i++) {

      request_bytes = args.request_address.first_byte().block_max() - args.request_address.first_byte();

      if( request_bytes > total_bytes - offset ) {
        request_bytes = total_bytes - offset;
      }

      DVLOG(5) << "sending release request with " << request_bytes
               << " of total bytes = " << *count_ * sizeof(T)
               << " to " << args.request_address;

      Grappa::send_heap_message(args.request_address.core(),
        [args](void * payload, size_t payload_size) {
          IRMetrics::count_release_ams( payload_size );
          DVLOG(5) << "Worker " << Grappa::current_worker()
          << " received release request to " << args.request_address
          << " reply to " << args.reply_address;
          memcpy( args.request_address.pointer(), payload, payload_size );
    
          auto reply_address = args.reply_address;
          Grappa::send_heap_message(args.reply_address.core(), [reply_address]{
            DVLOG(5) << "Worker " << Grappa::current_worker() << " received release reply to " << reply_address;
            reply_address.pointer()->release_reply();
          });
    
          DVLOG(5) << "Worker " << Grappa::current_worker()
          << " sent release reply to " << reply_address;
        },
        (char*)(*pointer_) + offset, request_bytes
      );

      // TODO: change type so we don't screw with pointer like this
      args.request_address = GlobalAddress<T>::Raw( args.request_address.raw_bits() + request_bytes );
    }
    DVLOG(5) << "release started for " << args.request_address;
    
    // blocks here waiting for messages to be sent
  }
Beispiel #20
0
        T fetch_and_add( U inc ) {

          block_until_ready();

          // fetch add unit is now aggregating so add my inc

          participant_count++;
          committed--;
          increment += inc;
        
          // if I'm the last entered client and either the flush threshold
          // is reached or there are no more committed participants then start the flush 
          if ( ready_waiters == 0 && (participant_count >= flush_threshold || committed == 0 )) {
            set_not_ready();
            uint64_t increment_total = increment;
            flat_combiner_fetch_and_add_amount += increment_total;
            auto t = target;
            result = call(target.core(), [t, increment_total]() -> U {
              T * p = t.pointer();
              uint64_t r = *p;
              *p += increment_total;
              return r;
            });
            // tell the others that the result has arrived
            Grappa::broadcast(&untilReceived);
          } else {
            // someone else will start the flush
            Grappa::wait(&untilReceived);
          }

          uint64_t my_start = result;
          result += inc;
          participant_count--;
          increment -= inc;   // for validation purposes (could just set to 0)
          if ( participant_count == 0 ) {
            CHECK( increment == 0 ) << "increment = " << increment << " even though all participants are done";
            set_ready();
          }

          return my_start;
        }
Beispiel #21
0
  void reset( ) {
    CHECK( !release_started_ || released_ ) << "inconsistent state for reset";
    release_started_ = false;
    released_ = false;
    thread_ = NULL;
    num_messages_ = 0;
    response_count_ = 0;
    if( *count_ == 0 ) {
      DVLOG(5) << "Zero-length release";
      release_started_ = true;
      released_ = true;
    } else if( request_address_->is_2D() ) {
      num_messages_ = 1;
      if( request_address_->core() == Grappa::mycore() ) {
        release_started_ = true;
        released_ = true;
      }
    } else {
      DVLOG(5) << "Straddle: block_max is " << (*request_address_ + *count_).block_max() ;
      DVLOG(5) << ", request_address is " << *request_address_;
      DVLOG(5) << ", sizeof(T) is " << sizeof(T);
      DVLOG(5) << ", count is " << *count_;
      DVLOG(5) << ", block_min is " << request_address_->block_min();


      DVLOG(5) << "Straddle: address is " << *request_address_ ;
      DVLOG(5) << ", address + count is " << *request_address_ + *count_;

      ptrdiff_t byte_diff = ( (*request_address_ + *count_ - 1).last_byte().block_max() - 
                               request_address_->first_byte().block_min() );

      DVLOG(5) << "Straddle: address block max is " << request_address_->block_max();
      DVLOG(5) << " address + count block max is " << (*request_address_ + *count_).block_max();
      DVLOG(5) << " address + count -1 block max is " << (*request_address_ + *count_ - 1).block_max();
      DVLOG(5) << " difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() );
      DVLOG(5) << " multiplied difference is " << ( (*request_address_ + *count_ - 1).block_max() - request_address_->block_min() ) * sizeof(T);
      DVLOG(5) << " address block min " << request_address_->block_min();
      DVLOG(5) << "Straddle: diff is " << byte_diff << " bs " << block_size;
      num_messages_ = byte_diff / block_size;
    }

    if( num_messages_ > 1 ) DVLOG(5) << "****************************** MULTI BLOCK CACHE REQUEST ******************************";

    DVLOG(5) << "Detecting straddle for sizeof(T):" << sizeof(T)
             << " count:" << *count_
             << " num_messages_:" << num_messages_
             << " request_address:" << *request_address_;
  }
Beispiel #22
0
 /// Overload to work on GlobalAddresses.
 // template<typename CompletionType>
 // inline void complete(GlobalAddress<CompletionType> ce, int64_t decr = 1) {
   // static_assert(std::is_base_of<CompletionEvent,CompletionType>::value,
   //               "complete() can only be called on subclasses of CompletionEvent");
 inline void complete(GlobalAddress<CompletionEvent> ce, int64_t decr = 1) {
   DVLOG(5) << "complete CompletionEvent";
   if (ce.core() == mycore()) {
     ce.pointer()->complete(decr);
   } else {
     ce_remote_completions += decr;
     if (decr == 1) {
       // (common case) don't send full 8 bytes just to decrement by 1
       send_heap_message(ce.core(), [ce] {
         ce.pointer()->complete();
       });
     } else {
       send_heap_message(ce.core(), [ce,decr] {
         ce.pointer()->complete(decr);
       });
     }
   }
 }
Beispiel #23
0
void activate(GlobalAddress<V> v) {
  delegate::call(v, [](V& v){ v->activate(); });
}
Beispiel #24
0
void bfs(GlobalAddress<G> _g, int nbfs, TupleGraph tg) {
  bool verified = false;
  double t;
      
  auto _frontier = GlobalBag<VertexID>::create(_g->nv);
  auto _next     = GlobalBag<VertexID>::create(_g->nv);
  call_on_all_cores([=]{ frontier = _frontier; next = _next; g = _g; });
    
  // do BFS from multiple different roots and average their times
  for (int root_idx = 0; root_idx < nbfs; root_idx++) {
  
    // intialize parent to -1
    forall(g, [](G::Vertex& v){ v->init(); v->level = -1; });
    
    VertexID root;
    if (FLAGS_max_degree_source) {
      forall(g, [](VertexID i, G::Vertex& v){
        max_degree << MaxDegree(i, v.nadj);
      });
      root = static_cast<MaxDegree>(max_degree).idx();
    } else {
      root = choose_root(g);
    }
    
    // setup 'root' as the parent of itself
    delegate::call(g->vs+root, [=](G::Vertex& v){
      v->parent = root;
      v->level = 0;
    });
    
    // reset frontier queues
    next->clear();
    frontier->clear();
    
    // start with root as only thing in frontier
    delegate::call((g->vs+root).core(), [=]{ frontier->add(root); });
    
    t = walltime();
    
    bool top_down = true;
    int64_t prev_nf = -1;
    int64_t frontier_edges = 0;
    int64_t remaining_edges = g->nadj;
    
    while (!frontier->empty()) {
      
      auto nf = frontier->size();
      VLOG(1) << "remaining_edges = " << remaining_edges << ", nf = " << nf << ", prev_nf = " << prev_nf << ", frontier_edges: " ;
      if (top_down && frontier_edges > remaining_edges/FLAGS_beamer_alpha && nf > prev_nf) {
        VLOG(1) << "switching to bottom-up";
        top_down = false;
      } else if (!top_down && frontier_edges < g->nv/FLAGS_beamer_beta && nf < prev_nf) {
        VLOG(1) << "switching to top-down";
        top_down = true;
      }
      
      edge_count = 0;
      
      if (top_down) {
                
        // iterate over vertices in this level of the frontier
        forall(frontier, [](VertexID& i){
          // visit all the adjacencies of the vertex
          // note: this has to be 'async' to prevent deadlock from
          //       running out of available workers
          forall<async>(adj(g,i), [i](G::Edge& e) {
            auto j = e.id;
            // at the core where the vertex is...
            delegate::call<async>(e.ga, [i,j](G::Vertex& vj){
              // note: no synchronization needed because 'call' is 
              // guaranteed to be executed atomically because it 
              // does no blocking operations
              if (vj->parent == -1) {
                // claim parenthood
                vj->parent = i;
                vj->level = current_depth;
                next->add(j);
                edge_count += vj.nadj;
              }
            });
          });
        });
      } else { // bottom-up
        
        forall<&phaser>(g, [](G::Vertex& v){
          if (v->level != -1) return;
          auto va = make_linear(&v);
          forall<async,&phaser>(adj(g,v), [=,&v](G::Edge& e){
            if (v->level != -1) return;
            
            phaser.enroll();
            auto eva = e.ga;
            send_heap_message(eva.core(), [=]{
              auto& ev = *eva.pointer();
              if (ev->level != -1 && ev->level < current_depth) {
                auto eid = g->id(ev);
                send_heap_message(va.core(), [=]{
                  auto& v = *va.pointer();
                  if (v->level == -1) {
                    next->add(g->id(v));
                    v->level = current_depth;
                    v->parent = eid;
                    edge_count += v.nadj;
                  }
                  phaser.complete();
                });
              } else {
                phaser.send_completion(va.core());
              }
            });
          });
        });
      }
      
      call_on_all_cores([=]{
        current_depth++;
        // switch to next frontier level
        std::swap(frontier, next);
      });
      next->clear();
      frontier_edges = edge_count;
      remaining_edges -= frontier_edges;
      prev_nf = nf;
    } // while (frontier not empty)
    
    double this_bfs_time = walltime() - t;
    LOG(INFO) << "(root=" << root << ", time=" << this_bfs_time << ")";
    
    if (!verified) {
      // only verify the first one to save time
      t = walltime();
      bfs_nedge = verify(tg, g, root);
      verify_time = (walltime()-t);
      LOG(INFO) << verify_time;
      verified = true;
      Metrics::reset_all_cores(); // don't count the first one
    } else {
      total_time += this_bfs_time;
    }
    
    bfs_mteps += bfs_nedge / this_bfs_time / 1.0e6;
  }
}
Beispiel #25
0
 void fill_remote(GlobalAddress<FullEmpty<T>> result_addr, const T& val) {
   send_heap_message(result_addr.core(), [result_addr,val]{
     result_addr->writeXF(val);
   });
 }
Beispiel #26
0
 inline void enroll(GlobalAddress<CompletionEvent> ce, int64_t incr = 1) {
   impl::call(ce.core(), [ce,incr]{ ce->enroll(incr); });
 }
  void do_acquire() {
    size_t total_bytes = *count_ * sizeof(T);
    RequestArgs args;
    args.request_address = *request_address_;
    DVLOG(5) << "Computing request_bytes from block_max " << request_address_->first_byte().block_max() << " and " << *request_address_;
    args.reply_address = make_global( this );
    args.offset = 0;  
    
    for(size_t i = 0;
         args.offset < total_bytes; 
         args.offset += args.request_bytes, i++) {

      args.request_bytes = args.request_address.first_byte().block_max() - args.request_address.first_byte();


      if( args.request_bytes > total_bytes - args.offset ) {
        args.request_bytes = total_bytes - args.offset;
      }

      DVLOG(5) << "sending acquire request for " << args.request_bytes
               << " of total bytes = " << *count_ * sizeof(T)
               << " from " << args.request_address;

      Grappa::send_heap_message(args.request_address.core(), [args]{
        IAMetrics::count_acquire_ams( args.request_bytes );
        DVLOG(5) << "Worker " << Grappa::current_worker()
        << " received acquire request to " << args.request_address
        << " size " << args.request_bytes
        << " offset " << args.offset
        << " reply to " << args.reply_address;
          
        DVLOG(5) << "Worker " << Grappa::current_worker()
        << " sending acquire reply to " << args.reply_address
        << " offset " << args.offset
        << " request address " << args.request_address
        << " payload address " << args.request_address.pointer()
        << " payload size " << args.request_bytes;
          
        // note: this will read the payload *later* when the message is copied into the actual send buffer,
        // should be okay because we're already assuming DRF, but something to watch out for
        auto reply_address = args.reply_address;
        auto offset = args.offset;
          
        Grappa::send_heap_message(args.reply_address.core(),
          [reply_address, offset](void * payload, size_t payload_size) {
            DVLOG(5) << "Worker " << Grappa::current_worker()
            << " received acquire reply to " << reply_address
            << " offset " << offset
            << " payload size " << payload_size;
            reply_address.pointer()->acquire_reply( offset, payload, payload_size);
          },
          args.request_address.pointer(), args.request_bytes
        );
          
        DVLOG(5) << "Worker " << Grappa::current_worker()
        << " sent acquire reply to " << args.reply_address
        << " offset " << args.offset
        << " request address " << args.request_address
        << " payload address " << args.request_address.pointer()
        << " payload size " << args.request_bytes;
      });

      // TODO: change type so we don't screw with pointer like this
      args.request_address = GlobalAddress<T>::Raw( args.request_address.raw_bits() + args.request_bytes );
    }
    DVLOG(5) << "acquire started for " << args.request_address;      
  }
Beispiel #28
0
void bucket_sort(GlobalAddress<S> array, size_t nelems, int (*Scmp)(const void*,const void*), size_t (*lobits)(S,int), int log2buckets, int log2maxkey ) {
  double t, sort_time, histogram_time, allreduce_time, scatter_time, local_sort_scatter_time, put_back_time;

  int LOBITS = log2maxkey - log2buckets;
  size_t nbuckets = 1 << log2buckets;

  GlobalAddress<bucket_t<S> > bucketlist = Grappa::global_alloc<bucket_t<S> >(nbuckets);

#ifdef DEBUG
  for (size_t i=0; i<nbuckets; i++) {
    GlobalAddress<bucket_t<S> > bi = bucketlist+i;
    VLOG(1) << "bucket[" << i << "] on Node " << bi.core() << ", offset = " << bi.pointer() - bucketlist.localize(bi.core()) << ", ptr = " << bi.pointer();
  }
#endif

  sort_time = Grappa::walltime();

  // initialize globals and histogram counts
  // { setup_counts f(array, nbuckets, bucketlist); fork_join_custom(&f); }
  on_all_cores([nbuckets]{
    counts.resize(nbuckets);
    offsets.resize(nbuckets);
    for (size_t i=0; i<nbuckets; i++) {
      counts[i] = 0;
    }
  });

  t = Grappa::walltime();

  // do local bucket counts
  // forall_local<uint64_t,histogram>(array, nelems);
  forall(array, nelems, [lobits,LOBITS](int64_t i, S& v) {
    size_t b = lobits(v, LOBITS); // TODO decide how to compare general in pieces
    counts[b]++;
  });

  histogram_time = Grappa::walltime() - t;
  LOG(INFO) << "histogram_time: " << histogram_time;
  t = Grappa::walltime();

  // allreduce everyone's counts & compute global offsets (prefix sum)
  // { aggregate_counts f; fork_join_custom(&f); }
  on_all_cores([nbuckets] {
    CHECK_EQ(nbuckets, counts.size());
    // all nodes get total counts put into their counts array
    allreduce_inplace<size_t,collective_add>(&counts[0], counts.size());

    VLOG(1) << "after allreduce_inplace (just in case)";

    // everyone computes prefix sum over buckets locally
    offsets[0] = 0;
    for (size_t i=1; i<offsets.size(); i++) {
      offsets[i] = offsets[i-1] + counts[i-1];
    }
  });
  
  allreduce_time = Grappa::walltime() - t;
  LOG(INFO) << "allreduce_time: " << allreduce_time;

  // allocate space in buckets
  VLOG(3) << "allocating space...";
  // forall_local<bucket_t,init_buckets>(bucketlist, nbuckets);
  forall(bucketlist, nbuckets, [](int64_t id, bucket_t<S>& bucket){
    // (global malloc doesn't call constructors)
    new (&bucket) bucket_t<S>();
    bucket.reserve(counts[id]);
  });
  
  VLOG(3) << "scattering...";
      t = Grappa::walltime();

  // scatter into buckets
  // forall_local<uint64_t,scatter>(array, nelems);
  forall(array, nelems, [bucketlist,lobits,LOBITS](int64_t s, int64_t n, S * first){
    size_t nbuckets = counts.size();
    
    for (int i=0; i<n; i++) {
      auto v = first[i];
      size_t b = lobits(v, LOBITS);
      CHECK( b < nbuckets ) << "bucket id = " << b << ", nbuckets = " << nbuckets;
      // ff_delegate<bucket_t,uint64_t,ff_append>(bucketlist+b, v);
      auto destb = bucketlist+b;
      delegate::call<async>(destb.core(), [destb,v]{
        destb.pointer()->append(v);
      });
    }
  });
    
  scatter_time = Grappa::walltime() - t;
  LOG(INFO) << "scatter_time: " << scatter_time;
  t = Grappa::walltime();

  // sort buckets locally
  // forall_local<bucket_t,sort_bucket>(bucketlist, nbuckets);
  /// Do some kind of local serial sort of a bucket
  forall(bucketlist, nbuckets, [Scmp](int64_t bucket_id, bucket_t<S>& bucket){
    if (bucket.size() == 0) return;
    qsort(&bucket[0], bucket.size(), sizeof(S), Scmp);
  });

  local_sort_scatter_time = Grappa::walltime() - t;
  LOG(INFO) << "local_sort_time: " << local_sort_scatter_time;  
  t = Grappa::walltime(); 
  
  // redistribute buckets back into global array  
  // forall_local<bucket_t,put_back_bucket>(bucketlist, nbuckets);
  /// Redistribute sorted buckets back into global array
  forall(bucketlist, nbuckets, [array,bucketlist](int64_t b, bucket_t<S>& bucket) {
    const size_t NBUF = BUFSIZE / sizeof(S);
    DCHECK( b < counts.size() );

    // TODO: shouldn't need to buffer this, but a bug of some sort is currently forcing us to limit the number of outstanding messages    
    //for_buffered(i, n, 0, bucket.size(), NBUF) {
    //  typename Incoherent<S>::WO c(array+offsets[b]+i, n, &bucket[i]);
    //  c.block_until_released();
    // }
    
    if ( bucket.size() > 0 ) {
      typename Incoherent<S>::WO c(array+offsets[b], bucket.size(), &bucket[0]); // FIXME v is not in locale-shared-memory
      c.block_until_released();
    }

    VLOG(3) << "bucket[" << b << "] release successful";
  });
  
  put_back_time = Grappa::walltime() - t;
  LOG(INFO) << "put_back_time: " << put_back_time;
  
  sort_time = Grappa::walltime() - sort_time;
  LOG(INFO) << "total_sort_time: " << sort_time;
}
Beispiel #29
0
 // release data at pointer in local heap
 /// (should be called only on node responsible for allocator)
 void local_free( GlobalAddress< void > address ) {
   void * va = reinterpret_cast< void * >( address.raw_bits() );
   a_p_->free( va );
 }