/**
 * Distributes the decomposition's data blocks across every GPU in the
 * cluster, one block at a time, always handing the next block to the GPU
 * with the most remaining headroom (largest kTimeEstimate minus its own
 * predicted runtime).
 *
 * @param cluster        cluster whose GPUs receive the blocks.
 * @param decomp         decomposition holding the blocks to place.
 * @param kTimeEstimate  target runtime estimate used to rank the GPUs.
 */
void Balancer::perfBalanceGPU(Cluster &cluster, Decomposition& decomp,
                              const double kTimeEstimate) {
  const int kGPUOnly = 2;
  WorkQueue work_queue;
  WorkRequest work_request;
  const int kNumTotalGPUs = cluster.getNumTotalGPUs();

  // Nothing to place, or nowhere to place it.
  if (decomp.getNumSubDomains() == 0 || kNumTotalGPUs == 0)
    return;

  for (int gpu_index = 0; gpu_index < kNumTotalGPUs; ++gpu_index) {
    Node& gpu = cluster.getGlobalGPU(gpu_index);
    // fastest gpu will have largest weight, and thus move to front of queue
    work_request.setTimeDiff(kTimeEstimate - gpu.getBalTimeEst(1, kGPUOnly));
    work_request.setIndex(gpu_index);
    work_queue.push(work_request);
  }

  const int kNumBlocks = decomp.getNumSubDomains();
  // place data blocks on gpu's one-at-a-time
  for (int block_id = 0; block_id < kNumBlocks; ++block_id) {
    work_request = work_queue.top();
    work_queue.pop();
    Node& gpu = cluster.getGlobalGPU(work_request.getIndex());
    gpu.incrementBalCount();
    // BUG FIX: re-rank this gpu by its remaining headroom
    // (kTimeEstimate - estimate), matching the initial push above.
    // The previous code pushed the raw estimate, which inverted the
    // queue ordering and fed subsequent blocks to the slowest gpu.
    double time_diff = kTimeEstimate - gpu.getBalTimeEst(1, kGPUOnly);
    work_request.setTimeDiff(time_diff);
    work_queue.push(work_request);
    //printWorkQueue(work_queue);
  }

  cluster.distributeBlocks(&decomp);
}
void Balancer::perfBalance(Cluster &cluster, Decomposition& decomp, const int kConfig) { WorkQueue work_queue; WorkRequest work_request; double total_weight(0.0); double min_edge_weight(0.0); double procTime(0.0); double commTime(0.0); double timeEst = procTime; bool changed(false); const int kGPUOnly(2); const int kStrongest(3); Node &root = cluster.getNode(0); size_t num_blocks = decomp.getNumSubDomains(); // initialize block directory cluster.setNumBlocks(num_blocks); //get total iterations per second for cluster for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) { total_weight += cluster.getNode(node).getTotalWeight(kConfig); min_edge_weight += cluster.getNode(node).getMinEdgeWeight(kConfig); } // quick estimation of runtime procTime = num_blocks / total_weight; commTime = num_blocks / min_edge_weight; timeEst = procTime; if (0.0 < min_edge_weight) timeEst += commTime; if (kGPUOnly == kConfig) { perfBalanceGPU(cluster, decomp, timeEst); } else if (kStrongest == kConfig) { perfBalanceStrongestDevice(cluster, decomp); } else { // perform initial task distribution for (size_t i = 0; i < num_blocks; ++i) root.incrementBalCount(); /*fprintf(stderr, "perfBalance: \n\ttime est: %f sec\n\tprocTime: %f sec\n\tcommTime: %f \ sec\n\ttotal weight:%e \n\tmin edge weight:%e.\n", timeEst, procTime, commTime, total_weight, min_edge_weight); // */ do { changed = false; // balance the work between nodes and root for (unsigned int cpu_index = 1; cpu_index < cluster.getNumNodes(); ++cpu_index) { Node& cpu_node = cluster.getNode(cpu_index); int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) - cpu_node.getBalCount(); if (0 > work_deficit) { // node has extra work int extra_blocks = abs(work_deficit); for (int block_index = 0; (block_index < extra_blocks) && (0 < cpu_node.getBalCount()); ++block_index) { // move block from child to parent cpu_node.decrementBalCount(); root.incrementBalCount(); changed = true; } } else if (0 < work_deficit) { //child needs 
more work work_request.setTimeDiff(timeEst - cpu_node.getBalTimeEst(0, kConfig)); work_request.setIndex(cpu_index); work_queue.push(work_request); } } for (unsigned int cpu_index = 0; cpu_index < root.getNumChildren(); ++cpu_index) { Node& cpu_node = root.getChild(cpu_index); int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) - cpu_node.getBalCount(); if (0 > work_deficit) { // child has extra work int extra_blocks = abs(work_deficit); for (int block_index = 0; (block_index < extra_blocks) && (0 < cpu_node.getBalCount()); ++block_index) { // move block from child to parent cpu_node.decrementBalCount(); root.incrementBalCount(); changed = true; } } else if (0 < work_deficit) { // child needs more work work_request.setTimeDiff(timeEst - cpu_node.getBalTimeEst(0, kConfig)); work_request.setIndex(-1 * cpu_index); // hack so I know to give to one of root's children work_queue.push(work_request); } } /* at this point we have all extra blocks, and now we need to distribute blocks to children that need it */ while (0 < root.getBalCount() && // there are blocks left to give !work_queue.empty()) { // there are requests left to fill // get largest request WorkRequest tmp = work_queue.top(); work_queue.pop(); double newTimeDiff = 0.0; int id = tmp.getIndex(); if (id <= 0) { // local child id = -1 * id; root.decrementBalCount(); root.getChild(id).incrementBalCount(); newTimeDiff = timeEst - root.getChild(id).getBalTimeEst(0, kConfig); changed = true; } else { // request was from another node in cluster root.decrementBalCount(); cluster.getNode(id).incrementBalCount(); newTimeDiff = timeEst - cluster.getNode(id).getBalTimeEst(0, kConfig); changed = true; } // if there is still work left to do put it back on // the queue so that it will reorder correctly if (0 < newTimeDiff) { tmp.setTimeDiff(newTimeDiff); work_queue.push(tmp); } } // balance the work within each node for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) { changed |= 
balanceNode(cluster.getNode(node), timeEst, kConfig); } } while (changed); } /* now that we know where everything should go, distribute the blocks */ cluster.distributeBlocks(&decomp); /* the work is balanced, so we can fill the block directory */ cluster.storeBlockLocs(); }