Example No. 1
 /// Runs on HOME_CORE once per core: fold each core's contribution into a
 /// running total, then broadcast the final result to every core.
 template <typename T, T (*ReduceOp)(const T&, const T&)> // assumed template header (omitted in the excerpt)
 void collect_reduction(const T& val) {
   static T total;
   static Core cores_in = 0;
   
   DCHECK(mycore() == HOME_CORE);
   
   if (cores_in == 0) {
     total = val;
   } else {
     total = ReduceOp(total, val);
   }
   
   cores_in++;
   DVLOG(4) << "cores_in: " << cores_in;
   
   if (cores_in == cores()) {
     cores_in = 0;
     T tmp_total = total;
     for (Core c = 0; c < cores(); c++) {
       send_heap_message(c, [tmp_total] {
         Reduction<T>::result.writeXF(tmp_total);
       });
     }
   }
 }
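
Without a Grappa runtime at hand, the combining pattern above can be mimicked in plain C++. A minimal sketch (Combiner and all of its members are hypothetical names, not Grappa API):

#include <functional>

// First contribution initializes the total, later ones are folded in with
// the reduction operator, and the last contributor delivers the result.
template <typename T>
class Combiner {
 public:
  Combiner(int participants, std::function<T(const T&, const T&)> op,
           std::function<void(const T&)> deliver)
      : remaining_(participants), op_(op), deliver_(deliver) {}

  void contribute(const T& val) {
    total_ = seen_any_ ? op_(total_, val) : val;
    seen_any_ = true;
    if (--remaining_ == 0) deliver_(total_);  // last one in "broadcasts"
  }

 private:
  int remaining_;
  bool seen_any_ = false;
  T total_{};
  std::function<T(const T&, const T&)> op_;
  std::function<void(const T&)> deliver_;
};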
Example No. 2
 static GlobalAddress<GlobalBag> create(size_t total_capacity) {
   auto self = symmetric_global_alloc<GlobalBag>();
   // Per-core capacity: integer division plus the whole remainder; this
   // rounds up (over-)generously so all cores together hold >= total_capacity.
   auto n = total_capacity / cores()
            + total_capacity % cores();
   // Placement-new a local instance at the same symmetric address on every core.
   call_on_all_cores([=]{
     new (self.localize()) GlobalBag(self, n);
   });
   return self;
 }
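
The sizing above gives each core the quotient plus the whole remainder, over-allocating up to (cores()-1)² elements in total; plain ceiling division is tighter (at most cores()-1 extra overall) and still sufficient. A sketch with a hypothetical helper, not part of the excerpt:

#include <cstddef>

// Hypothetical tighter sizing: ceiling division still guarantees
// num_cores * result >= total_capacity.
std::size_t per_core_capacity(std::size_t total_capacity, std::size_t num_cores) {
  return (total_capacity + num_cores - 1) / num_cores;
}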
Example No. 3
// Export to R
RcppExport SEXP rflann_RadiusSearch(SEXP query_SEXP,
                                    SEXP ref_SEXP,
                                    SEXP radiusSEXP,
                                    SEXP max_neighbourSEXP,
                                    SEXP buildSEXP,
                                    SEXP coresSEXP,
                                    SEXP checksSEXP) {
BEGIN_RCPP
    Rcpp::RObject __result;
    Rcpp::RNGScope __rngScope;
    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type
        query_(query_SEXP);
    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type
        ref_(ref_SEXP);
    Rcpp::traits::input_parameter< double >::type
        radius(radiusSEXP);
    Rcpp::traits::input_parameter< int >::type
        max_neighbour(max_neighbourSEXP);
    Rcpp::traits::input_parameter< std::string >::type
        build(buildSEXP);
    Rcpp::traits::input_parameter< int >::type
        cores(coresSEXP);
    Rcpp::traits::input_parameter< int >::type
        checks(checksSEXP);
    __result = Rcpp::wrap(RadiusSearch(query_, ref_, radius,
                                       max_neighbour, build, cores, checks));
    return __result;
END_RCPP
}
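
The glue above assumes a native RadiusSearch with a signature along these lines; the return type here is a guess (anything Rcpp::wrap accepts would do):

#include <Rcpp.h>
#include <string>

// Hypothetical declaration of the wrapped routine; the real rflann
// return type may differ.
Rcpp::List RadiusSearch(Rcpp::NumericMatrix query_,
                        Rcpp::NumericMatrix ref_,
                        double radius,
                        int max_neighbour,
                        std::string build,
                        int cores,
                        int checks);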
Example No. 4
 /// Mark a certain number of things completed. When the global count on all cores goes to 0, all
 /// tasks waiting on the GCE will be woken.
 ///
 /// Note: this can be called in a message handler (e.g. remote completes from stolen tasks).
 void complete(int64_t dec = 1) {
   count -= dec;
   DVLOG(4) << "complete (" << count << ") -- gce(" << this << ")";
   
   // out of work here
   if (count == 0) { // count[dec -> 0]
     // enter cancellable barrier
     send_heap_message(master_core, [this] {
       cores_out--;
       DVLOG(4) << "core entered barrier (cores_out:"<< cores_out <<")";
       
       // if all are in
       if (cores_out == 0) { // cores_out[1 -> 0]
         CHECK_EQ(count, 0);
         // notify everyone to wake
         for (Core c = 0; c < cores(); c++) {
           send_heap_message(c, [this] {
             CHECK_EQ(count, 0);
             DVLOG(3) << "broadcast";
             broadcast(&cv); // wake anyone who was waiting here
             reset(); // reset, now anyone else calling `wait` should fall through
           });
         }
       }
     });
   }
 }
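
A minimal single-process sketch of the completion-event semantics the GCE code above relies on (hypothetical; not Grappa's implementation): wait() blocks until every enrolled completion has arrived, then all waiters wake.

#include <condition_variable>
#include <mutex>

class MiniCompletionEvent {
 public:
  void enroll(long n = 1) {
    std::lock_guard<std::mutex> g(m_);
    count_ += n;
  }
  void complete(long n = 1) {
    std::lock_guard<std::mutex> g(m_);
    count_ -= n;
    if (count_ == 0) cv_.notify_all();  // wake everyone, like broadcast(&cv)
  }
  void wait() {
    std::unique_lock<std::mutex> lk(m_);
    cv_.wait(lk, [this] { return count_ == 0; });
  }
 private:
  std::mutex m_;
  std::condition_variable cv_;
  long count_ = 0;
};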
Example No. 5
 template <typename F> // assumed template header (omitted in the excerpt)
 void call_on_all_cores(F work) {
   Core origin = mycore();
   CompletionEvent ce(cores()-1);
   
   // Empty lambda with the same capture list as the real message body,
   // used only to compute the per-message size for the pool.
   auto lsz = [&ce,origin,work]{};
   MessagePool pool(cores()*(sizeof(Message<decltype(lsz)>)));
   
   for (Core c = 0; c < cores(); c++) if (c != mycore()) {
     pool.send_message(c, [&ce, origin, work] {
       work();
       send_heap_message(origin, [&ce]{ ce.complete(); });
     });
   }
   work(); // do my core's work
   ce.wait();
 }
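
A typical call site, assuming the Grappa-style API used throughout these excerpts (this snippet is illustrative, not from the source):

void example() {
  // Every core runs the lambda; the caller blocks until all have finished.
  call_on_all_cores([] {
    DVLOG(1) << "hello from core " << mycore();
  });
}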
Example No. 6
 template <typename F> // assumed template header (omitted in the excerpt)
 void on_all_cores(F work) {
   
   CompletionEvent ce(cores());
   auto ce_addr = make_global(&ce);
   
   // Empty lambda with the same capture list, used only to size the messages.
   auto lsz = [ce_addr,work]{};
   MessagePool pool(cores()*(sizeof(Message<decltype(lsz)>)));
   
   for (Core c = 0; c < cores(); c++) {
     pool.send_message(c, [ce_addr, work] {
       spawn([ce_addr, work] {
         work();
         complete(ce_addr);
       });
     });
   }
   ce.wait();
 }
Example No. 7
 // T, P, Accessor, and ReduceOp are template parameters of the enclosing
 // reducer (their declarations are omitted in this excerpt).
 T reduce(GlobalAddress<P> localizable) {
   CompletionEvent ce(cores() - 1);

   T total = Accessor(localizable);
   Core origin = mycore();

   for (Core c = 0; c < cores(); c++) {
     if (c != origin) {
       send_heap_message(c, [localizable, &ce, &total, origin]{
         T val = Accessor(localizable);
         send_heap_message(origin, [val, &ce, &total] {
           total = ReduceOp(total, val);
           ce.complete();
         });
       });
     }
   }
   ce.wait();
   return total;
 }
Example No. 8
template <typename T> // assumed template header (omitted in the excerpt)
GlobalAddress<T> symmetric_global_alloc() {
  static_assert(sizeof(T) % block_size == 0,
                "must pad global proxy to multiple of block_size, or use GRAPPA_BLOCK_ALIGNED");
  // allocate enough space that we are guaranteed to get one on each core at same location
  auto qac = global_alloc<char>(cores()*(sizeof(T)+block_size));
  while (qac.core() != MASTER_CORE) qac++;
  auto qa = static_cast<GlobalAddress<T>>(qac);
  CHECK_EQ(qa, qa.block_min());
  CHECK_EQ(qa.core(), MASTER_CORE);
  return qa;
}
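
A type passed to symmetric_global_alloc must satisfy the static_assert above. A hypothetical conforming proxy, assuming Grappa's usual 64-byte block_size:

// Pad the struct out to a whole number of blocks.
struct Counter {
  long value;
  char pad[64 - sizeof(long)];
};
static_assert(sizeof(Counter) % 64 == 0, "padded to a multiple of block_size");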
Example No. 9
 template <typename T, T (*ReduceOp)(const T&, const T&)> // assumed template header (omitted in the excerpt)
 T reduce(T * global_ptr) {
   // NOTE: written in continuation-passing style to avoid using a GCE,
   // the only completion mechanism that async delegates support.
   CompletionEvent ce(cores()-1);
   // TODO: look into optionally stack-allocating pool storage like in IncoherentAcquirer.
   MessagePool pool(cores() * sizeof(Message<std::function<void(T*)>>));
 
   T total = *global_ptr;
   Core origin = mycore();
   
   for (Core c=0; c<cores(); c++) {
     if (c != origin) {
       pool.send_message(c, [global_ptr, &ce, &total, origin]{
         T val = *global_ptr;
         send_heap_message(origin, [val,&ce,&total] {
           total = ReduceOp(total, val);
           ce.complete();
         });
       });
     }
   }
   ce.wait();
   return total;
 }
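
The ReduceOp parameter in Examples 7 and 9 is an ordinary binary function; for instance, a sum reduction could instantiate it as (hypothetical example, not from the source):

// Example reduction operator usable as the ReduceOp template argument.
template <typename T>
T add(const T& a, const T& b) { return a + b; }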
Example No. 10
//////////////////////////////////
// Main entry function of the GPU cache model
//////////////////////////////////
int main(int argc, char** argv) {
    srand(time(0));
    std::cout << SPLIT_STRING << std::endl;
    message("");

    // Flush messages as soon as possible
    std::cout.setf(std::ios_base::unitbuf);

    // Read the hardware settings from file
    Settings hardware = get_settings();

    // Print cache statistics
    message("Cache configuration:");
    std::cout << "### \t Cache size: ~" << hardware.cache_bytes/1024 << "KB" << std::endl;
    std::cout << "### \t Line size: " << hardware.line_size << " bytes" << std::endl;
    std::cout << "### \t Layout: " << hardware.cache_ways << " ways, " << hardware.cache_sets << " sets" << std::endl;
    message("");

    // Parse the input arguments and make sure there are exactly two
    if (argc != 3) {
        message("Error: provide exactly two arguments (a benchmark name and a suite name)");
        message("");
        std::cout << SPLIT_STRING << std::endl;
        exit(1);
    }
    std::string benchname = argv[1];
    std::string suitename = argv[2];

    // Loop over all found traces in the folder (one trace per kernel, 20 at most)
    for (unsigned kernel_id = 0; kernel_id < 20; kernel_id++) {
        std::vector<Thread> threads(MAX_THREADS);
        for (unsigned t=0; t<MAX_THREADS; t++) {
            threads[t] = Thread();
        }

        // Set the kernelname and include a counter
        std::string kernelname;
        if (kernel_id < 10) {
            kernelname = benchname+"_0"+std::to_string(kernel_id);
        }
        else {
            kernelname = benchname+"_" +std::to_string(kernel_id);
        }

        // Load a memory access trace from a file
        Dim3 blockdim = read_file(threads, kernelname, benchname, suitename);
        unsigned blocksize = blockdim.x*blockdim.y*blockdim.z;

        // No trace could be found at all - exit with an error
        if (blocksize == 0 && kernel_id == 0) {
            std::cout << "### Error: could not read file 'output/" << benchname << "/" << kernelname << ".trc'" << std::endl;
            message("");
            std::cout << SPLIT_STRING << std::endl;
            exit(1);
        }

        // The final tracefile is already processed, exit the loop
        if (blocksize == 0) {
            break;
        }

        // Assign threads to warps, threadblocks and GPU cores
        message("");
        std::cout << "### Assigning threads to warps/blocks/cores...";
        unsigned num_blocks = ceil(threads.size()/(float)(blocksize));
        unsigned num_warps_per_block = ceil(blocksize/(float)(hardware.warp_size));
        std::vector<std::vector<unsigned>> warps(num_warps_per_block*num_blocks);
        std::vector<std::vector<unsigned>> blocks(num_blocks);
        std::vector<std::vector<unsigned>> cores(hardware.num_cores);
        schedule_threads(threads, warps, blocks, cores, hardware, blocksize);
        std::cout << "done" << std::endl;

        // Model only a single core; modelling multiple cores would require a loop over 'cid'
        unsigned cid = 0;

        // Compute the number of active blocks on this core
        unsigned hardware_max_active_blocks = std::min(hardware.max_active_threads/blocksize, hardware.max_active_blocks);
        unsigned active_blocks = std::min((unsigned)cores[cid].size(), hardware_max_active_blocks);

        // Start the computation of the reuse distance profile
        message("");
        std::cout << "### [core " << cid << "]:" << std::endl;
        std::cout << "### Running " << active_blocks << " block(s) at a time" << std::endl;
        std::cout << "### Calculating the reuse distances";

        // Random generator used below to draw Gaussian memory latencies
        std::random_device random;
        std::mt19937 gen(random());

        // Compute the reuse distance for 4 different cases
        std::vector<map_type<unsigned,unsigned>> distances(NUM_CASES);
        for (unsigned runs = 0; runs < NUM_CASES; runs++) {
            std::cout << "...";
            unsigned sets, ways;
            unsigned ml, ms, nml;
            unsigned mshr;

            // CASE 0 | Normal - full model
            sets = hardware.cache_sets;
            ways = hardware.cache_ways;
            ml = hardware.mem_latency;
            ms = hardware.mem_latency_stddev;
            nml = NON_MEM_LATENCY;
            mshr = hardware.num_mshr;

            // CASE 1 | Only 1 set: don't model associativity
            if (runs == 1) {
                sets = 1;
                ways = hardware.cache_ways*hardware.cache_sets;
            }

            // CASE 2 | Memory latency to 0: don't model latencies
            if (runs == 2) {
                ml = 0;
                ms = 0;
                nml = 0;
            }

            // CASE 3 | MSHR count to infinite: don't model MSHRs
            if (runs == 3) {
                mshr = INF;
            }

            // Calculate the reuse distance profile
            std::normal_distribution<> distribution(0,ms);
            reuse_distance(cores[cid], blocks, warps, threads, distances[runs], active_blocks, hardware,
                           sets, ways, ml, nml, mshr, gen, distribution);
        }
        std::cout << "done" << std::endl;

        // Process the reuse distance profile to obtain the cache hit/miss rate
        message("");
        output_miss_rate(distances, kernelname, benchname, suitename, hardware);

        // Display the cache hit/miss rate from the output of the verifier (if available)
        message("");
        verify_miss_rate(kernelname, benchname);
        message("");
    }

    // End of the program
    std::cout << SPLIT_STRING << std::endl;
    return 0;
}
Example No. 11
// Randomized greedy coloring: runs 'precision' iterations with random vertex
// priorities and returns the best (smallest) color count found; if 'cor' is
// set, the best coloring is stored in this->colors.
int Graph::colore(int precision, bool cor)
{
    int nCrom = this->n;
    std::vector<Vertex> verts(this->n);
    std::vector<int> cores(this->n);
    std::priority_queue<Vertex, std::vector<Vertex>, std::less<Vertex> > heaps;

    int k;
    for(k=0; k < precision; k++)
    {
        for(int j = 0; j < this->n; j++)
        {
            // Random priority in [0, deg(j)); high-degree vertices tend to be
            // popped (and thus colored) first. Assumes vDeg[j] > 0.
            verts[j].value = rand() % vDeg[j];
            verts[j].index = j+1;
            heaps.push(verts[j]);
        }
        Vertex v;
        int i;
        int l;
        bool ok = true;
        int crom = 0;

        // Vector of queues, where the i-th queue holds the vertices assigned color i
        std::vector< std::deque<int> > C(this->n);

        while (!heaps.empty())
        {
            v = heaps.top();
            heaps.pop();
            i = 0;
            while (true)
            {
                for(std::deque<int>::iterator it = C[i].begin(); it < C[i].end(); it++)
                {
                    for (std::vector<int>::iterator j = this->vec[v.index-1].begin(); j < this->vec[v.index-1].end(); j++)
                    {
                        if(*j == *it)
                        {
                            ok = false;
                            break; // out of the inner loop
                        }
                    }
                    if (!ok)
                    {
                        break;
                    }
                }
                if(ok)
                {
                    C[i].insert(C[i].end(),v.index);
                    if (crom < i+1)
                    {
                        crom = i+1;
                    }
                    break;
                }
                i++;
                ok = true;
            }
        }
        if (nCrom > crom)
        {
            nCrom = crom;
            if (cor)
            {
                l = 0;
                while (true)
                {
                    if (C[l].empty()) break;
                    for (std::deque<int>::iterator pos = C[l].begin(); pos < C[l].end(); pos++)
                    {
                        cores[*pos-1] = l;
                    }
                    l++;
                }
            }
        }
    }
    if (cor) this->colors = cores;
    return nCrom;
}
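
The heap above orders Vertex with std::less, so Vertex needs operator<. A minimal definition consistent with the usage (hypothetical; field names taken from the code):

// Max-heap pops the vertex with the largest random key first.
struct Vertex {
  int value = 0;  // random priority key
  int index = 0;  // 1-based vertex id
  bool operator<(const Vertex& other) const { return value < other.value; }
};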
Example No. 12
      /// SPMD: must be called on the same static/file-global object on all cores;
      /// blocks until the reduction is complete.
      void call_allreduce(T * in_array, size_t nelem) {
        // setup everything (block to make sure HOME_CORE is done)
        this->array = in_array;
        this->nelem = nelem;
        
        size_t n_per_msg = MAX_MESSAGE_SIZE / sizeof(T);
        size_t nmsg = nelem / n_per_msg + (nelem % n_per_msg ? 1 : 0);
        auto nmsg_total = nmsg*(cores()-1);

        CompletionEvent local_ce;
        this->ce = &local_ce;
        this->ce->enroll( (mycore() == HOME_CORE) ? nmsg_total : nmsg );
        barrier();
        
        if (mycore() != HOME_CORE) {
          for (size_t k=0; k<nelem; k+=n_per_msg) {
            size_t this_nelem = MIN(n_per_msg, nelem-k);
            
            // everyone sends their contribution to HOME_CORE, last one wakes HOME_CORE
            send_heap_message(HOME_CORE, [this,k](void * payload, size_t payload_size) {
              DCHECK(mycore() == HOME_CORE);
      
              auto in_array = static_cast<T*>(payload);
              auto in_n = payload_size/sizeof(T);
              auto total = this->array+k;
      
              for (size_t i=0; i<in_n; i++) {
                total[i] = ReduceOp(total[i], in_array[i]);
              }
              DVLOG(3) << "incrementing HOME sem, now at " << ce->get_count();      
              this->ce->complete();
            }, (void*)(in_array+k), sizeof(T)*this_nelem);
          }
          
          DVLOG(3) << "about to block for " << nelem << " with sem == " << ce->get_count();           this->ce->wait();
          
        } else {
          
          // home core waits until woken by last received message from other cores
          this->ce->wait();
          DVLOG(3) << "woke with sem == " << ce->get_count();
          
          // send total to everyone else and wake them
          char msg_buf[(cores()-1)*sizeof(PayloadMessage<std::function<void(decltype(this),size_t)>>)];
          MessagePool pool(msg_buf, sizeof(msg_buf));
          for (Core c = 0; c < cores(); c++) {
            if (c != HOME_CORE) {
              // send totals back to all the other cores
              size_t n_per_msg = MAX_MESSAGE_SIZE / sizeof(T);
              for (size_t k=0; k<nelem; k+=n_per_msg) {
                size_t this_nelem = MIN(n_per_msg, nelem-k);
                pool.send_message(c, [this,k](void * payload, size_t psz){
                  auto total_k = static_cast<T*>(payload);
                  auto in_n = psz / sizeof(T);
                  for (size_t i=0; i<in_n; i++) {
                    this->array[k+i] = total_k[i];
                  }
                  this->ce->complete();
                  DVLOG(3) << "incrementing sem, now at " << ce->get_count();
                }, this->array+k, sizeof(T)*this_nelem);              
              }
            }
          }
          // once all messages are sent, HOME_CORE's task continues
        }
      }
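
A hedged usage sketch: every core calls into the same static object with its local array, and on return each core holds the element-wise reduction. AllReducer and spmd_body are hypothetical stand-in names for the enclosing class and caller:

static AllReducer<long> reducer;  // must be the same static object on every core

void spmd_body() {
  long local[1024];
  // ... fill 'local' with this core's contribution ...
  reducer.call_allreduce(local, 1024);  // blocks; 'local' now holds the global result
}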