void Balancer::perfBalanceGPU(Cluster &cluster, Decomposition& decomp,
        const double kTimeEstimate) {
  const int kGPUOnly = 2;
  WorkQueue work_queue;
  WorkRequest work_request;
  const int kNumTotalGPUs = cluster.getNumTotalGPUs();
  if (decomp.getNumSubDomains() == 0 || kNumTotalGPUs == 0) return;

  for (int gpu_index = 0; gpu_index < kNumTotalGPUs; ++gpu_index) {
    Node& gpu = cluster.getGlobalGPU(gpu_index);
    // the fastest gpu will have the largest time difference, and thus
    // move to the front of the queue
    work_request.setTimeDiff(kTimeEstimate - gpu.getBalTimeEst(1, kGPUOnly));
    work_request.setIndex(gpu_index);
    work_queue.push(work_request);
  }

  const int kNumBlocks = decomp.getNumSubDomains();
  // place data blocks on the gpus one at a time, always giving the next
  // block to the gpu with the largest remaining time budget
  for (int block_id = 0; block_id < kNumBlocks; ++block_id) {
    work_request = work_queue.top();
    work_queue.pop();
    Node& gpu = cluster.getGlobalGPU(work_request.getIndex());
    gpu.incrementBalCount();
    // re-key the request on the updated time difference so the queue
    // reorders correctly now that this gpu holds one more block
    work_request.setTimeDiff(kTimeEstimate - gpu.getBalTimeEst(1, kGPUOnly));
    work_queue.push(work_request);
  }
  cluster.distributeBlocks(&decomp);
}
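/* The balancing loops in this file assume WorkQueue is a max-heap keyed
 * on each request's time difference, so the most under-loaded device is
 * always at the top. The real WorkRequest/WorkQueue types are declared
 * elsewhere in the project; the sketch below only records that ordering
 * assumption and is not their actual definition:
 *
 *   #include <queue>
 *
 *   class WorkRequest {
 *    public:
 *     WorkRequest() : time_diff_(0.0), index_(0) {}
 *     void setTimeDiff(double diff) { time_diff_ = diff; }
 *     double getTimeDiff() const { return time_diff_; }
 *     void setIndex(int index) { index_ = index; }
 *     int getIndex() const { return index_; }
 *     // std::priority_queue is a max-heap, so the request with the
 *     // largest time difference is served first
 *     bool operator<(const WorkRequest &other) const {
 *       return time_diff_ < other.time_diff_;
 *     }
 *    private:
 *     double time_diff_;
 *     int index_;
 *   };
 *   typedef std::priority_queue<WorkRequest> WorkQueue;
 */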
void Balancer::balance(Cluster &cluster, Decomposition& decomp,
        const int kConfig) {
  const int kCPUAndGPU = 0;
  const int kCPUOnly = 1;
  const int kGPUOnly = 2;
  int blocks_per_node = 0;
  // number of devices that will receive blocks
  unsigned int total_nodes = cluster.getNumNodes();
  size_t num_blocks = decomp.getNumSubDomains();

  // initialize block directory
  cluster.setNumBlocks(num_blocks);

  if (kConfig != kCPUOnly) {
    unsigned int num_gpus = 0;
    for (unsigned int node_index = 0; node_index < cluster.getNumNodes();
            ++node_index) {
      num_gpus += cluster.getNode(node_index).getNumChildren();
    }
    if (kConfig == kGPUOnly)
      total_nodes = num_gpus;
    else if (kConfig == kCPUAndGPU)
      total_nodes += num_gpus;
  }
  blocks_per_node = ceil(decomp.getNumSubDomains() / (float) total_nodes);

  for (unsigned int node_index = 0; node_index < cluster.getNumNodes();
          ++node_index) {
    Node& node = cluster.getNode(node_index);
    if (kConfig == kCPUOnly || kConfig == kCPUAndGPU) {
      for (int subd = 0;
              subd < blocks_per_node && 0 < decomp.getNumSubDomains();
              ++subd) {
        SubDomain *block = decomp.popSubDomain();
        node.addSubDomain(block);
      }
    }
    if (kConfig == kGPUOnly || kConfig == kCPUAndGPU) {
      for (unsigned int gpu_index = 0; gpu_index < node.getNumChildren();
              ++gpu_index) {
        Node& gpu = node.getChild(gpu_index);
        for (int subd = 0;
                subd < blocks_per_node && 0 < decomp.getNumSubDomains();
                ++subd) {
          SubDomain *block = decomp.popSubDomain();
          gpu.addSubDomain(block);
        }
      }
    }
  }
  /* the work is balanced, so we can fill the block directory */
  cluster.storeBlockLocs();
}
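/* Hypothetical call site for the static distribution above (Cluster and
 * Decomposition construction is elided; the config value mirrors the
 * constants defined inside balance()):
 *
 *   Balancer balancer;
 *   balancer.balance(cluster, decomp, 0);  // 0 == kCPUAndGPU
 *   // blocks_per_node = ceil(num_blocks / total_nodes), where
 *   // total_nodes counts the cpu nodes plus every gpu child
 */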
void Balancer::perfBalanceStrongestDevice(Cluster &cluster,
        Decomposition& decomp) {
  WorkQueue work_queue;
  WorkRequest work_request;
  double total_weight(0.0);
  double min_edge_weight(0.0);
  double procTime(0.0);
  double commTime(0.0);
  double timeEst(0.0);
  bool changed(false);
  const int kStrongest(3);
  const int kConfig = kStrongest;
  Node &root = cluster.getNode(0);
  const size_t kNumBlocks = decomp.getNumSubDomains();

  // initialize block directory
  cluster.setNumBlocks(kNumBlocks);

  // get total iterations per second for the cluster
  for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
    total_weight += cluster.getNode(node).getTotalWeight(kConfig);
    min_edge_weight += cluster.getNode(node).getMinEdgeWeight(kConfig);
  }

  // quick estimate of the runtime: processing time plus communication
  // time, guarding against division by zero and overflow
  procTime = (0.0 == total_weight) ?
          std::numeric_limits<double>::max() :
          kNumBlocks / total_weight;
  commTime = (0.0 == min_edge_weight) ?
          std::numeric_limits<double>::max() :
          kNumBlocks / min_edge_weight;
  timeEst = procTime;
  timeEst += (std::numeric_limits<double>::max() - procTime >= commTime) ?
          commTime : 0.0;

  // place all of the blocks on the root node
  for (size_t i = 0; i < kNumBlocks; ++i)
    root.incrementBalCount();

  do {
    changed = false;

    // balance the work between the other nodes and root
    for (unsigned int cpu_index = 1; cpu_index < cluster.getNumNodes();
            ++cpu_index) {
      Node& cpu_node = cluster.getNode(cpu_index);
      int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) -
              cpu_node.getBalCount();
      if (0 > work_deficit) {  // node has extra work
        int extra_blocks = abs(work_deficit);
        for (int block_index = 0;
                (block_index < extra_blocks) &&
                (0 < cpu_node.getBalCount());
                ++block_index) {
          // move a block from the node back to root
          cpu_node.decrementBalCount();
          root.incrementBalCount();
          changed = true;
        }
      } else if (0 < work_deficit) {  // node needs more work
        work_request.setTimeDiff(timeEst - cpu_node.getBalTimeEst(0, kConfig));
        work_request.setIndex(cpu_index);
        work_queue.push(work_request);
      }
    }

    // go through all of root's gpus
    for (unsigned int index = 0; index < root.getNumChildren(); ++index) {
      Node& node = root.getChild(index);
      int work_deficit = node.getTotalWorkNeeded(timeEst, kConfig) -
              node.getBalCount();
      if (0 > work_deficit) {  // child has extra work
        int extra_blocks = abs(work_deficit);
        for (int block_index = 0;
                (block_index < extra_blocks) &&
                (0 < node.getBalCount());
                ++block_index) {
          // move a block from the child back to root
          node.decrementBalCount();
          root.incrementBalCount();
        }
      } else if (0 < work_deficit) {  // child needs more work
        work_request.setTimeDiff(timeEst - node.getBalTimeEst(0, kConfig));
        // non-positive index signals that the request came from one of
        // root's own children rather than another cluster node
        work_request.setIndex(-static_cast<int>(index));
        work_queue.push(work_request);
      }
    }

    /* at this point root holds all extra blocks; now distribute them to
       the children that requested more work */
"true" : "false"); // while root has extra blocks and there are requests left to fill while (0 < (root.getBalCount() - root.getTotalWorkNeeded(timeEst, kConfig)) && !work_queue.empty()) { //printf("root needs %d blocks and has %d blocks.\n", // root.getTotalWorkNeeded(timeEst, kConfig), root.getBalCount()); // get largest request WorkRequest tmp = work_queue.top(); work_queue.pop(); double newTimeDiff = 0.0; int id = tmp.getIndex(); if (id <= 0) { // local child //printf("giving block to local child.\n"); id = -1 * id; root.decrementBalCount(); root.getChild(id).incrementBalCount(); newTimeDiff = timeEst - root.getChild(id).getBalTimeEst(0, kConfig); changed = true; } else { // request was from another node in cluster //printf("giving block to cpu node child.\n"); root.decrementBalCount(); cluster.getNode(id).incrementBalCount(); newTimeDiff = timeEst - cluster.getNode(id).getBalTimeEst(0, kConfig); changed = true; } // if there is still work left to do put it back on // the queue so that it will reorder correctly if (0 < newTimeDiff) { tmp.setTimeDiff(newTimeDiff); work_queue.push(tmp); } } //printf("after distributing extra blocks to cpu nodes, changed == %s\n", // (changed == true) ? "true" : "false"); // balance the work within each node for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) { balanceNode(cluster.getNode(node), timeEst, kConfig); //changed |= balanceNode(cluster.getNode(node), timeEst, kConfig); } //printf("after balancing within each node, changed == %s\n", // (changed == true) ? "true" : "false"); //printClusterBalCount(cluster); //printf("************* END OF BALANCE ITERATION ***********\n"); } while (changed); }
void Balancer::perfBalance(Cluster &cluster, Decomposition& decomp,
        const int kConfig) {
  WorkQueue work_queue;
  WorkRequest work_request;
  double total_weight(0.0);
  double min_edge_weight(0.0);
  double procTime(0.0);
  double commTime(0.0);
  double timeEst(0.0);
  bool changed(false);
  const int kGPUOnly(2);
  const int kStrongest(3);
  Node &root = cluster.getNode(0);
  size_t num_blocks = decomp.getNumSubDomains();

  // initialize block directory
  cluster.setNumBlocks(num_blocks);

  // get total iterations per second for the cluster
  for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
    total_weight += cluster.getNode(node).getTotalWeight(kConfig);
    min_edge_weight += cluster.getNode(node).getMinEdgeWeight(kConfig);
  }

  // quick estimate of the runtime; guard against division by zero
  procTime = (0.0 == total_weight) ?
          std::numeric_limits<double>::max() :
          num_blocks / total_weight;
  timeEst = procTime;
  if (0.0 < min_edge_weight) {
    commTime = num_blocks / min_edge_weight;
    timeEst += commTime;
  }

  if (kGPUOnly == kConfig) {
    perfBalanceGPU(cluster, decomp, timeEst);
  } else if (kStrongest == kConfig) {
    perfBalanceStrongestDevice(cluster, decomp);
  } else {
    // perform the initial task distribution: all blocks start on root
    for (size_t i = 0; i < num_blocks; ++i)
      root.incrementBalCount();

    do {
      changed = false;

      // balance the work between the other nodes and root
      for (unsigned int cpu_index = 1; cpu_index < cluster.getNumNodes();
              ++cpu_index) {
        Node& cpu_node = cluster.getNode(cpu_index);
        int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) -
                cpu_node.getBalCount();
        if (0 > work_deficit) {  // node has extra work
          int extra_blocks = abs(work_deficit);
          for (int block_index = 0;
                  (block_index < extra_blocks) &&
                  (0 < cpu_node.getBalCount());
                  ++block_index) {
            // move a block from the node back to root
            cpu_node.decrementBalCount();
            root.incrementBalCount();
            changed = true;
          }
        } else if (0 < work_deficit) {  // node needs more work
          work_request.setTimeDiff(timeEst -
                  cpu_node.getBalTimeEst(0, kConfig));
          work_request.setIndex(cpu_index);
          work_queue.push(work_request);
        }
      }

      // go through all of root's gpus
      for (unsigned int gpu_index = 0; gpu_index < root.getNumChildren();
              ++gpu_index) {
        Node& gpu_node = root.getChild(gpu_index);
        int work_deficit = gpu_node.getTotalWorkNeeded(timeEst, kConfig) -
                gpu_node.getBalCount();
        if (0 > work_deficit) {  // gpu has extra work
          int extra_blocks = abs(work_deficit);
          for (int block_index = 0;
                  (block_index < extra_blocks) &&
                  (0 < gpu_node.getBalCount());
                  ++block_index) {
            // move a block from the gpu back to root
            gpu_node.decrementBalCount();
            root.incrementBalCount();
            changed = true;
          }
        } else if (0 < work_deficit) {  // gpu needs more work
          work_request.setTimeDiff(timeEst -
                  gpu_node.getBalTimeEst(0, kConfig));
          // non-positive index signals that the request came from one
          // of root's own children rather than another cluster node
          work_request.setIndex(-static_cast<int>(gpu_index));
          work_queue.push(work_request);
        }
      }

      /* at this point root holds all extra blocks; now distribute them
         to the children that requested more work */
      while (0 < root.getBalCount() &&  // there are blocks left to give
              !work_queue.empty()) {    // and requests left to fill
        // get the largest request
        WorkRequest tmp = work_queue.top();
        work_queue.pop();
        double newTimeDiff = 0.0;
        int id = tmp.getIndex();
        if (id <= 0) {  // request came from one of root's gpus
          id = -1 * id;
          root.decrementBalCount();
          root.getChild(id).incrementBalCount();
          newTimeDiff = timeEst -
                  root.getChild(id).getBalTimeEst(0, kConfig);
          changed = true;
        } else {  // request came from another node in the cluster
          root.decrementBalCount();
          cluster.getNode(id).incrementBalCount();
          newTimeDiff = timeEst -
                  cluster.getNode(id).getBalTimeEst(0, kConfig);
          changed = true;
        }
        // if the requester can still use more work, put the request
        // back on the queue so that it reorders correctly
        if (0 < newTimeDiff) {
          tmp.setTimeDiff(newTimeDiff);
          work_queue.push(tmp);
        }
      }

      // balance the work within each node
      for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
        changed |= balanceNode(cluster.getNode(node), timeEst, kConfig);
      }
    } while (changed);
  }

  /* now that we know where everything should go, distribute the blocks */
  cluster.distributeBlocks(&decomp);
  /* the work is balanced, so we can fill the block directory */
  cluster.storeBlockLocs();
}
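/* Hypothetical end-to-end use of the performance-model balancer; how
 * the Cluster and Decomposition get built is outside this file, so the
 * setup lines below are placeholders:
 *
 *   Cluster cluster = ...;       // node 0 acts as root
 *   Decomposition decomp = ...;  // flat list of SubDomain blocks
 *   Balancer balancer;
 *   balancer.perfBalance(cluster, decomp, 0);  // 0 == kCPUAndGPU
 *   // afterwards distributeBlocks()/storeBlockLocs() have run, so
 *   // every block has an owner recorded in the block directory
 */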