// Collects one value from each core on HOME_CORE; once every core has
// contributed, broadcasts the fully-reduced total back to all cores.
void collect_reduction(const T& val) {
  static T total;
  static Core cores_in = 0;

  DCHECK(mycore() == HOME_CORE);

  if (cores_in == 0) {
    total = val;
  } else {
    total = ReduceOp(total, val);
  }

  cores_in++;
  DVLOG(4) << "cores_in: " << cores_in;

  if (cores_in == cores()) {
    cores_in = 0;
    T tmp_total = total;
    for (Core c = 0; c < cores(); c++) {
      send_heap_message(c, [tmp_total] {
        Reduction<T>::result.writeXF(tmp_total);
      });
    }
  }
}
// Allocate a GlobalBag symmetrically on all cores, splitting the total
// capacity across cores (each core's share is padded by the remainder so
// the total always fits).
static GlobalAddress<GlobalBag> create(size_t total_capacity) {
  auto self = symmetric_global_alloc<GlobalBag>();
  auto n = total_capacity / cores() + total_capacity % cores();
  call_on_all_cores([=] {
    new (self.localize()) GlobalBag(self, n);
  });
  return self;
}
// Export to R
RcppExport SEXP rflann_RadiusSearch(SEXP query_SEXP, SEXP ref_SEXP, SEXP radiusSEXP,
                                    SEXP max_neighbourSEXP, SEXP buildSEXP,
                                    SEXP coresSEXP, SEXP checksSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type query_(query_SEXP);
    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type ref_(ref_SEXP);
    Rcpp::traits::input_parameter< double >::type radius(radiusSEXP);
    Rcpp::traits::input_parameter< int >::type max_neighbour(max_neighbourSEXP);
    Rcpp::traits::input_parameter< std::string >::type build(buildSEXP);
    Rcpp::traits::input_parameter< int >::type cores(coresSEXP);
    Rcpp::traits::input_parameter< int >::type checks(checksSEXP);
    __result = Rcpp::wrap(RadiusSearch(query_, ref_, radius, max_neighbour, build, cores, checks));
    return __result;
END_RCPP
}
/// Mark a certain number of things completed. When the global count on all cores goes to 0,
/// all tasks waiting on the GCE will be woken.
///
/// Note: this can be called in a message handler (e.g. remote completes from stolen tasks).
void complete(int64_t dec = 1) {
  count -= dec;
  DVLOG(4) << "complete (" << count << ") -- gce(" << this << ")";

  // out of work here
  if (count == 0) { // count[dec -> 0]
    // enter cancellable barrier
    send_heap_message(master_core, [this] {
      cores_out--;
      DVLOG(4) << "core entered barrier (cores_out:" << cores_out << ")";

      // if all are in
      if (cores_out == 0) { // cores_out[1 -> 0]
        CHECK_EQ(count, 0);
        // notify everyone to wake
        for (Core c = 0; c < cores(); c++) {
          send_heap_message(c, [this] {
            CHECK_EQ(count, 0);
            DVLOG(3) << "broadcast";
            broadcast(&cv); // wake anyone who was waiting here
            reset();        // reset, now anyone else calling `wait` should fall through
          });
        }
      }
    });
  }
}
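// Usage sketch for the enroll/complete/wait protocol above (a minimal sketch;
// `do_work` is a hypothetical task body, and the enroll() call is assumed to
// match the enroll/wait pattern used by the allreduce code later in this file):
GlobalCompletionEvent gce; // file-global/static, so every core has its own copy

void run_tasks(int64_t ntasks) {
  gce.enroll(ntasks);   // register work before any complete() can arrive
  for (int64_t i = 0; i < ntasks; i++) {
    spawn([i] {
      do_work(i);       // hypothetical work function
      gce.complete();   // may be invoked from a message handler on any core
    });
  }
  gce.wait();           // suspend until the global count reaches zero
}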
// Run `work` on every core, blocking until all cores have finished.
void call_on_all_cores(F work) {
  Core origin = mycore();
  CompletionEvent ce(cores()-1);

  // dummy lambda with the same capture list, used only to size the messages
  auto lsz = [&ce,origin,work]{};
  MessagePool pool(cores()*(sizeof(Message<decltype(lsz)>)));

  for (Core c = 0; c < cores(); c++) {
    if (c != mycore()) {
      pool.send_message(c, [&ce, origin, work] {
        work();
        send_heap_message(origin, [&ce]{ ce.complete(); });
      });
    }
  }

  work(); // do my core's work
  ce.wait();
}
// Spawn `work` as a task on every core (rather than running it inline in the
// message handler); blocks until all spawned tasks have completed.
void on_all_cores(F work) {
  CompletionEvent ce(cores());
  auto ce_addr = make_global(&ce);

  // dummy lambda with the same capture list, used only to size the messages
  auto lsz = [ce_addr,work]{};
  MessagePool pool(cores()*(sizeof(Message<decltype(lsz)>)));

  for (Core c = 0; c < cores(); c++) {
    pool.send_message(c, [ce_addr, work] {
      spawn([ce_addr, work] {
        work();
        complete(ce_addr);
      });
    });
  }
  ce.wait();
}
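// Minimal usage sketch for the two broadcast helpers above: call_on_all_cores
// runs the lambda directly in each core's message handler, while on_all_cores
// spawns it as a task. Assumes glog-style logging is available (DVLOG is used
// elsewhere in this file).
void hello_all() {
  call_on_all_cores([] {
    LOG(INFO) << "hello from core " << mycore();
  });
}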
// Reduce the value extracted by `Accessor` from each core's copy of a
// symmetric (localizable) object; blocks until all contributions arrive.
T reduce( GlobalAddress<P> localizable ) {
  CompletionEvent ce(cores() - 1);
  T total = Accessor(localizable);
  Core origin = mycore();

  for (Core c = 0; c < cores(); c++) {
    if (c != origin) {
      send_heap_message(c, [localizable, &ce, &total, origin] {
        T val = Accessor(localizable);
        send_heap_message(origin, [val, &ce, &total] {
          total = ReduceOp(total, val);
          ce.complete();
        });
      });
    }
  }
  ce.wait();
  return total;
}
GlobalAddress<T> symmetric_global_alloc() {
  static_assert(sizeof(T) % block_size == 0,
                "must pad global proxy to multiple of block_size, or use GRAPPA_BLOCK_ALIGNED");
  // allocate enough space that we are guaranteed to get one on each core at same location
  auto qac = global_alloc<char>(cores()*(sizeof(T)+block_size));
  while (qac.core() != MASTER_CORE) qac++;
  auto qa = static_cast<GlobalAddress<T>>(qac);
  CHECK_EQ(qa, qa.block_min());
  CHECK_EQ(qa.core(), MASTER_CORE);
  return qa;
}
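// Usage sketch (hypothetical type): the static_assert above requires sizeof(T)
// to be a multiple of block_size; the GRAPPA_BLOCK_ALIGNED macro it names is
// assumed to provide that padding via alignment.
struct Tally {
  long hits;
  long misses;
} GRAPPA_BLOCK_ALIGNED;

void init_tally() {
  // the returned address is valid on every core; initialize each local copy
  auto t = symmetric_global_alloc<Tally>();
  call_on_all_cores([t] {
    t.localize()->hits = 0;
    t.localize()->misses = 0;
  });
}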
T reduce(T * global_ptr) {
  // NOTE: this is written in a continuation-passing style to avoid the use
  // of a GCE (which is all that async delegates support)
  CompletionEvent ce(cores()-1);
  // TODO: look into optionally stack-allocating pool storage like in IncoherentAcquirer.
  MessagePool pool(cores() * sizeof(Message<std::function<void(T*)>>));

  T total = *global_ptr;
  Core origin = mycore();

  for (Core c = 0; c < cores(); c++) {
    if (c != origin) {
      pool.send_message(c, [global_ptr, &ce, &total, origin] {
        T val = *global_ptr;
        send_heap_message(origin, [val, &ce, &total] {
          total = ReduceOp(total, val);
          ce.complete();
        });
      });
    }
  }
  ce.wait();
  return total;
}
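// Usage sketch: with a file-global (hence symmetric) counter, &partial is the
// same address on every core, so each remote core can dereference it locally.
// The template parameter order and the collective_add reduce-op are assumptions
// about how this template is declared elsewhere.
static long partial; // each core accumulates into its own copy

long global_sum() {
  return reduce<long, collective_add>(&partial); // combine every core's value
}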
//////////////////////////////////
// Main entry function of the GPU cache model
//////////////////////////////////
int main(int argc, char** argv) {
  srand(time(0));
  std::cout << SPLIT_STRING << std::endl;
  message("");

  // Flush messages as soon as possible
  std::cout.setf(std::ios_base::unitbuf);

  // Read the hardware settings from file
  Settings hardware = get_settings();

  // Print cache statistics
  message("Cache configuration:");
  std::cout << "### \t Cache size: ~" << hardware.cache_bytes/1024 << "KB" << std::endl;
  std::cout << "### \t Line size: " << hardware.line_size << " bytes" << std::endl;
  std::cout << "### \t Layout: " << hardware.cache_ways << " ways, " << hardware.cache_sets << " sets" << std::endl;
  message("");

  // Parse the input arguments and make sure that there are exactly two
  if (argc != 3) {
    message("Error: provide two arguments (a benchmark name and a suite name)");
    message("");
    std::cout << SPLIT_STRING << std::endl;
    exit(1);
  }
  std::string benchname = argv[1];
  std::string suitename = argv[2];

  // Loop over all found traces in the folder (one trace per kernel)
  for (unsigned kernel_id = 0; kernel_id < 20; kernel_id++) {
    std::vector<Thread> threads(MAX_THREADS);
    for (unsigned t=0; t<MAX_THREADS; t++) {
      threads[t] = Thread();
    }

    // Set the kernelname and include a counter
    std::string kernelname;
    if (kernel_id < 10) { kernelname = benchname+"_0"+std::to_string(kernel_id); }
    else                { kernelname = benchname+"_" +std::to_string(kernel_id); }

    // Load a memory access trace from a file
    Dim3 blockdim = read_file(threads, kernelname, benchname, suitename);
    unsigned blocksize = blockdim.x*blockdim.y*blockdim.z;

    // There was not a single trace that could be found - exit with an error
    if (blocksize == 0 && kernel_id == 0) {
      std::cout << "### Error: could not read file 'output/" << benchname << "/" << kernelname << ".trc'" << std::endl;
      message("");
      std::cout << SPLIT_STRING << std::endl;
      exit(1);
    }

    // The final tracefile is already processed, exit the loop
    if (blocksize == 0) { break; }

    // Assign threads to warps, threadblocks and GPU cores
    message("");
    std::cout << "### Assigning threads to warps/blocks/cores...";
    unsigned num_blocks = ceil(threads.size()/(float)(blocksize));
    unsigned num_warps_per_block = ceil(blocksize/(float)(hardware.warp_size));
    std::vector<std::vector<unsigned>> warps(num_warps_per_block*num_blocks);
    std::vector<std::vector<unsigned>> blocks(num_blocks);
    std::vector<std::vector<unsigned>> cores(hardware.num_cores);
    schedule_threads(threads, warps, blocks, cores, hardware, blocksize);
    std::cout << "done" << std::endl;

    // Model only a single core, modelling multiple cores requires a loop over 'cid'
    unsigned cid = 0;

    // Compute the number of active blocks on this core
    unsigned hardware_max_active_blocks = std::min(hardware.max_active_threads/blocksize, hardware.max_active_blocks);
    unsigned active_blocks = std::min((unsigned)cores[cid].size(), hardware_max_active_blocks);

    // Start the computation of the reuse distance profile
    message("");
    std::cout << "### [core " << cid << "]:" << std::endl;
    std::cout << "### Running " << active_blocks << " block(s) at a time" << std::endl;
    std::cout << "### Calculating the reuse distances";

    // Create a Gaussian distribution to model memory latencies
    std::random_device random;
    std::mt19937 gen(random());

    // Compute the reuse distance for 4 different cases
    std::vector<map_type<unsigned,unsigned>> distances(NUM_CASES);
    for (unsigned runs = 0; runs < NUM_CASES; runs++) {
      std::cout << "...";
      unsigned sets, ways;
      unsigned ml, ms, nml;
      unsigned mshr;

      // CASE 0 | Normal - full model
      sets = hardware.cache_sets;
      ways = hardware.cache_ways;
      ml = hardware.mem_latency;
      ms = hardware.mem_latency_stddev;
      nml = NON_MEM_LATENCY;
      mshr = hardware.num_mshr;

      // CASE 1 | Only 1 set: don't model associativity
      if (runs == 1) {
        sets = 1;
        ways = hardware.cache_ways*hardware.cache_sets;
      }

      // CASE 2 | Memory latency to 0: don't model latencies
      if (runs == 2) {
        ml = 0; ms = 0; nml = 0;
      }

      // CASE 3 | MSHR count to infinite: don't model MSHRs
      if (runs == 3) {
        mshr = INF;
      }

      // Calculate the reuse distance profile
      std::normal_distribution<> distribution(0,ms);
      reuse_distance(cores[cid], blocks, warps, threads, distances[runs], active_blocks,
                     hardware, sets, ways, ml, nml, mshr, gen, distribution);
    }
    std::cout << "done" << std::endl;

    // Process the reuse distance profile to obtain the cache hit/miss rate
    message("");
    output_miss_rate(distances, kernelname, benchname, suitename, hardware);

    // Display the cache hit/miss rate from the output of the verifier (if available)
    message("");
    verify_miss_rate(kernelname, benchname);
    message("");
  }

  // End of the program
  std::cout << SPLIT_STRING << std::endl;
  return 0;
}
// Greedy graph colouring with random vertex priorities, repeated 'precision'
// times; returns the smallest number of colours found. If 'cor' is set, the
// best colour assignment is stored in this->colors.
int Graph::colore(int precision, bool cor) {
  int nCrom = this->n;
  std::vector<Vertex> verts(this->n);
  std::vector<int> cores(this->n);
  std::priority_queue<Vertex, std::vector<Vertex>, std::less<Vertex> > heaps;
  int k;
  for (k = 0; k < precision; k++) {
    for (int j = 0; j < this->n; j++) {
      verts[j].value = rand() % vDeg[j];
      verts[j].index = j+1;
      heaps.push(verts[j]);
    }
    Vertex v;
    int i;
    int l;
    bool ok = true;
    int crom = 0;
    // Vector of queues, where the i-th queue holds the vertices assigned colour i
    std::vector< std::deque<int> > C(this->n);
    while (!heaps.empty()) {
      v = heaps.top();
      heaps.pop();
      i = 0;
      while (true) {
        for (std::deque<int>::iterator it = C[i].begin(); it < C[i].end(); it++) {
          for (std::vector<int>::iterator j = this->vec[v.index-1].begin(); j < this->vec[v.index-1].end(); j++) {
            if (*j == *it) {
              ok = false;
              break; // break the inner loop
            }
          }
          if (!ok) { break; }
        }
        if (ok) {
          C[i].insert(C[i].end(), v.index);
          if (crom < i+1) { crom = i+1; }
          break;
        }
        i++;
        ok = true;
      }
    }
    if (nCrom > crom) {
      nCrom = crom;
      if (cor) {
        l = 0;
        while (true) {
          if (C[l].empty()) break;
          for (std::deque<int>::iterator pos = C[l].begin(); pos < C[l].end(); pos++) {
            cores[*pos-1] = l;
          }
          l++;
        }
      }
    }
  }
  if (cor) this->colors = cores;
  return nCrom;
}
/// SPMD, must be called on static/file-global object on all cores
/// blocks until reduction is complete
void call_allreduce(T * in_array, size_t nelem) {
  // setup everything (block to make sure HOME_CORE is done)
  this->array = in_array;
  this->nelem = nelem;
  size_t n_per_msg = MAX_MESSAGE_SIZE / sizeof(T);
  size_t nmsg = nelem / n_per_msg + (nelem % n_per_msg ? 1 : 0);
  auto nmsg_total = nmsg*(cores()-1);

  CompletionEvent local_ce;
  this->ce = &local_ce;
  this->ce->enroll( (mycore() == HOME_CORE) ? nmsg_total : nmsg );
  barrier();

  if (mycore() != HOME_CORE) {
    for (size_t k=0; k<nelem; k+=n_per_msg) {
      size_t this_nelem = MIN(n_per_msg, nelem-k);
      // everyone sends their contribution to HOME_CORE, last one wakes HOME_CORE
      send_heap_message(HOME_CORE, [this,k](void * payload, size_t payload_size) {
        DCHECK(mycore() == HOME_CORE);
        auto in_array = static_cast<T*>(payload);
        auto in_n = payload_size/sizeof(T);
        auto total = this->array+k;
        for (size_t i=0; i<in_n; i++) {
          total[i] = ReduceOp(total[i], in_array[i]);
        }
        DVLOG(3) << "incrementing HOME sem, now at " << ce->get_count();
        this->ce->complete();
      }, (void*)(in_array+k), sizeof(T)*this_nelem);
    }
    DVLOG(3) << "about to block for " << nelem << " with sem == " << ce->get_count();
    this->ce->wait();
  } else {
    // home core waits until woken by last received message from other cores
    this->ce->wait();
    DVLOG(3) << "woke with sem == " << ce->get_count();

    // send total to everyone else and wake them
    char msg_buf[(cores()-1)*sizeof(PayloadMessage<std::function<void(decltype(this),size_t)>>)];
    MessagePool pool(msg_buf, sizeof(msg_buf));
    for (Core c = 0; c < cores(); c++) {
      if (c != HOME_CORE) {
        // send totals back to all the other cores
        for (size_t k=0; k<nelem; k+=n_per_msg) {
          size_t this_nelem = MIN(n_per_msg, nelem-k);
          pool.send_message(c, [this,k](void * payload, size_t psz) {
            auto total_k = static_cast<T*>(payload);
            auto in_n = psz / sizeof(T);
            for (size_t i=0; i<in_n; i++) {
              this->array[k+i] = total_k[i];
            }
            this->ce->complete();
            DVLOG(3) << "incrementing sem, now at " << ce->get_count();
          }, this->array+k, sizeof(T)*this_nelem);
        }
      }
    }
    // once all messages are sent, HOME_CORE's task continues
  }
}
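// Usage sketch for call_allreduce (names hypothetical: the enclosing reducer
// class and its template parameters are assumptions; the doc comment above only
// requires a static/file-global instance called on all cores):
static AllReducer<double, collective_add> reducer; // hypothetical instance, one copy per core

void spmd_task() {                       // runs on every core
  double local[1024];
  fill_local_partials(local, 1024);      // hypothetical: compute this core's contribution
  reducer.call_allreduce(local, 1024);   // blocks until the reduction completes
  // 'local' now holds the element-wise totals across all cores, on every core
}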