Example #1
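Balancer::perfBalanceGPU distributes the decomposition's blocks across every GPU in the cluster, one block at a time, always handing the next block to whichever GPU sits at the front of a priority queue of work requests.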
void Balancer::perfBalanceGPU(Cluster &cluster, Decomposition& decomp,
        const double kTimeEstimate) {
  const int kGPUOnly = 2;
  WorkQueue work_queue;
  WorkRequest work_request;
  const int kNumTotalGPUs = cluster.getNumTotalGPUs();
  if (decomp.getNumSubDomains() == 0 || kNumTotalGPUs == 0)
    return;
  for (int gpu_index = 0; gpu_index < kNumTotalGPUs; ++gpu_index) {
    Node& gpu = cluster.getGlobalGPU(gpu_index);
    // the fastest gpu will have the largest weight, and thus move to the front of the queue
    work_request.setTimeDiff(kTimeEstimate - gpu.getBalTimeEst(1, kGPUOnly));
    work_request.setIndex(gpu_index);
    work_queue.push(work_request);
  }

  const int kNumBlocks = decomp.getNumSubDomains();
  // place data blocks on gpus one at a time
  for (int block_id = 0; block_id < kNumBlocks; ++block_id) {
    work_request = work_queue.top();
    work_queue.pop();

    Node& gpu = cluster.getGlobalGPU(work_request.getIndex());
    gpu.incrementBalCount();

    double time_diff = gpu.getBalTimeEst(1, kGPUOnly);

    work_request.setTimeDiff(time_diff);
    work_queue.push(work_request);
    //printWorkQueue(work_queue);
  }

  cluster.distributeBlocks(&decomp);
}
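The examples rely on WorkQueue and WorkRequest types declared elsewhere in the project. As a rough guide only, and purely an assumption about their shape, they behave like a max-priority queue of (time difference, device index) pairs, so that the request with the most slack is served first; a minimal sketch:

#include <queue>

// Minimal sketch (an assumption, not the project's actual declarations).
// A WorkRequest records how far ahead of the time estimate a device is
// (its slack) and which device made the request.
class WorkRequest {
 public:
  WorkRequest() : time_diff_(0.0), index_(0) {}
  void setTimeDiff(double time_diff) { time_diff_ = time_diff; }
  double getTimeDiff() const { return time_diff_; }
  void setIndex(int index) { index_ = index; }
  int getIndex() const { return index_; }
  // std::priority_queue keeps the largest element on top, so ordering by
  // time difference serves the request with the most slack first.
  bool operator<(const WorkRequest &rhs) const {
    return time_diff_ < rhs.time_diff_;
  }
 private:
  double time_diff_;
  int index_;
};

typedef std::priority_queue<WorkRequest> WorkQueue;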
Example #2
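Balancer::balance performs a simple even split: it computes a fixed number of blocks per device and hands that many blocks to each CPU node and/or each GPU, depending on the requested configuration.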
void Balancer::balance(Cluster &cluster, Decomposition& decomp,
        const int kConfig) {
  const int kCPUAndGPU = 0;
  const int kCPUOnly = 1;
  const int kGPUOnly = 2;
  int blocks_per_node = 0;
  // start with the number of cpu nodes; gpus may be added below
  unsigned int total_nodes = cluster.getNumNodes();
  size_t num_blocks = decomp.getNumSubDomains();

  // initialize block directory
  cluster.setNumBlocks(num_blocks);

  if (kConfig != kCPUOnly) {
    unsigned int num_gpus = 0;
    for (unsigned int node_index = 0;
            node_index < cluster.getNumNodes();
            ++node_index) {
      num_gpus += cluster.getNode(node_index).getNumChildren();
    }
    if (kConfig == kGPUOnly) // gpu only
      total_nodes = num_gpus;
    else if (kConfig == kCPUAndGPU) // cpu and gpu
      total_nodes += num_gpus;
  }

  blocks_per_node = ceil(decomp.getNumSubDomains() / (float) total_nodes);
  for (unsigned int node_index = 0;
          node_index < cluster.getNumNodes();
          ++node_index) {
    Node& node = cluster.getNode(node_index);
    if (kConfig == kCPUOnly || kConfig == kCPUAndGPU) {
      for (int subd = 0;
              subd < blocks_per_node && 0 < decomp.getNumSubDomains();
              ++subd) {
        SubDomain *block = decomp.popSubDomain();
        node.addSubDomain(block);
      }
    }
    if (kConfig == kGPUOnly || kConfig == kCPUAndGPU) {
      for (unsigned int gpu_index = 0;
              gpu_index < node.getNumChildren();
              ++gpu_index) {
        Node& gpu = node.getChild(gpu_index);
        for (int subd = 0;
                subd < blocks_per_node && 0 < decomp.getNumSubDomains();
                ++subd) {
          SubDomain *block = decomp.popSubDomain();
          gpu.addSubDomain(block);
        }
      }
    }
  }
  /* the work is balanced, so we can fill the block directory */
  cluster.storeBlockLocs();
}
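To make the rounding in blocks_per_node concrete: with 10 sub-domains, 2 CPU nodes, and 1 GPU per node under the kCPUAndGPU configuration, total_nodes is 4 and blocks_per_node = ceil(10 / 4.0) = 3, so the first three devices receive 3 blocks each and the last GPU receives the single remaining block.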
Example #3
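Balancer::perfBalanceStrongestDevice estimates the runtime from the cluster's total weight and minimum edge weight, places every block on the root node, and then repeatedly shifts blocks between the root, the other nodes, and the root's GPUs until the assignment stops changing.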
void Balancer::perfBalanceStrongestDevice(Cluster &cluster, Decomposition& decomp) {
  WorkQueue work_queue;
  WorkRequest work_request;
  double total_weight(0.0);
  double min_edge_weight(0.0);
  double procTime(0.0);
  double commTime(0.0);
  double timeEst = procTime;
  bool changed(false);
  const int kStrongest(3);
  const int kConfig = kStrongest;
  Node &root = cluster.getNode(0);
  const size_t kNumBlocks = decomp.getNumSubDomains();

  // initialize block directory
  cluster.setNumBlocks(kNumBlocks);

  //get total iterations per second for cluster
  for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
    total_weight += cluster.getNode(node).getTotalWeight(kConfig);
    min_edge_weight += cluster.getNode(node).getMinEdgeWeight(kConfig);
  }

  // quick estimation of runtime

  procTime = (0.0 == total_weight) ?
          std::numeric_limits<double>::max() :
          kNumBlocks / total_weight;
  commTime = (0.0 == min_edge_weight) ?
          std::numeric_limits<double>::max() :
          kNumBlocks / min_edge_weight;
  timeEst = procTime;
  timeEst += (std::numeric_limits<double>::max() - procTime >= commTime) ?
          commTime :
          0.0;
  //printf("timeEst:%f\n", timeEst);

  // place all of the blocks on the root node
  for (size_t i = 0; i < kNumBlocks; ++i)
    root.incrementBalCount();

  /*fprintf(stderr,
          "perfBalance: \n\ttime est: %f sec\n\tprocTime: %f sec\n\tcommTime: %f \
          sec\n\ttotal weight:%e \n\tmin edge weight:%e.\n",
          timeEst, procTime, commTime, total_weight, min_edge_weight);  // */

  do {
    changed = false;
    //printf("beginning, changed == %s\n", (changed == true) ? "true" : "false");
    // balance the work between nodes and root
    for (unsigned int cpu_index = 1;
            cpu_index < cluster.getNumNodes();
            ++cpu_index) {
      Node& cpu_node = cluster.getNode(cpu_index);
      int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) -
              cpu_node.getBalCount();
      if (0 > work_deficit) { // node has extra work
        //printf("cpu node %d has %d extra blocks.\n", cpu_index, work_deficit);
        int extra_blocks = abs(work_deficit);
        for (int block_index = 0;
                (block_index < extra_blocks) &&
                (0 < cpu_node.getBalCount());
                ++block_index) {
          // move block from child to parent
          cpu_node.decrementBalCount();
          root.incrementBalCount();
          changed = true;
        }
      } else if (0 < work_deficit) { //child needs more work
        work_request.setTimeDiff(timeEst - cpu_node.getBalTimeEst(0, kConfig));
        work_request.setIndex(cpu_index);
        work_queue.push(work_request);
      }
    }

    // go through all of the root node's gpus
    for (unsigned int index = 0; index < root.getNumChildren(); ++index) {
      Node& node = root.getChild(index);
      int work_deficit = node.getTotalWorkNeeded(timeEst, kConfig) -
              node.getBalCount();
      if (0 > work_deficit) { // child has extra work
        //printf("root child %d has %d blocks and only needs %d blocks.\n", index,
          //      node.getBalCount(), node.getTotalWorkNeeded(timeEst, kConfig));
        int extra_blocks = abs(work_deficit);
        for (int block_index = 0;
                (block_index < extra_blocks) && (0 < node.getBalCount());
                ++block_index) {
          // move block from child to parent
          node.decrementBalCount();
          root.incrementBalCount();
          //changed = true;
        }
      } else if (0 < work_deficit) { // child needs more work
        work_request.setTimeDiff(timeEst - node.getBalTimeEst(0, kConfig));
        work_request.setIndex(-1 * index); // hack so I know to give to one of root's children
        work_queue.push(work_request);
      }
    }

    /*
       at this point we have all extra blocks, and
       now we need to distribute blocks to children
       that need it
     */

    //printf("after collecting extra blocks from children, changed == %s\n",
      //      (changed == true) ? "true" : "false");
    // while root has extra blocks and there are requests left to fill
    while (0 < (root.getBalCount() - root.getTotalWorkNeeded(timeEst, kConfig)) &&
            !work_queue.empty()) {
      //printf("root needs %d blocks and has %d blocks.\n",
        //      root.getTotalWorkNeeded(timeEst, kConfig), root.getBalCount());
      // get largest request
      WorkRequest tmp = work_queue.top();
      work_queue.pop();
      
      double newTimeDiff = 0.0;
      int id = tmp.getIndex();
      if (id <= 0) { // local child
        //printf("giving block to local child.\n");
        id = -1 * id;
        root.decrementBalCount();
        root.getChild(id).incrementBalCount();
        newTimeDiff = timeEst - root.getChild(id).getBalTimeEst(0, kConfig);
        changed = true;
      } else { // request was from another node in cluster
        //printf("giving block to cpu node child.\n");
        root.decrementBalCount();
        cluster.getNode(id).incrementBalCount();
        newTimeDiff = timeEst - cluster.getNode(id).getBalTimeEst(0, kConfig);
        changed = true;
      }
      // if there is still work left to do put it back on
      // the queue so that it will reorder correctly
      if (0 < newTimeDiff) {
        tmp.setTimeDiff(newTimeDiff);
        work_queue.push(tmp);
      }
    }
    //printf("after distributing extra blocks to cpu nodes, changed == %s\n",
      //      (changed == true) ? "true" : "false");
    // balance the work within each node
    for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
      balanceNode(cluster.getNode(node), timeEst, kConfig);
      //changed |= balanceNode(cluster.getNode(node), timeEst, kConfig);
    }
    //printf("after balancing within each node, changed == %s\n",
      //      (changed == true) ? "true" : "false");
    //printClusterBalCount(cluster);
    //printf("************* END OF BALANCE ITERATION ***********\n");
  } while (changed);
}
Example #4
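Balancer::perfBalance is the performance-based entry point: it computes the same runtime estimate, delegates to perfBalanceGPU or perfBalanceStrongestDevice for those configurations, and otherwise runs an iterative balancing loop like the one above; in every case it finishes by distributing the blocks and filling the block directory.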
void Balancer::perfBalance(Cluster &cluster, Decomposition& decomp,
        const int kConfig) {
  WorkQueue work_queue;
  WorkRequest work_request;
  double total_weight(0.0);
  double min_edge_weight(0.0);
  double procTime(0.0);
  double commTime(0.0);
  double timeEst = procTime;
  bool changed(false);
  const int kGPUOnly(2);
  const int kStrongest(3);
  Node &root = cluster.getNode(0);
  size_t num_blocks = decomp.getNumSubDomains();

  // initialize block directory
  cluster.setNumBlocks(num_blocks);

  //get total iterations per second for cluster
  for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
    total_weight += cluster.getNode(node).getTotalWeight(kConfig);
    min_edge_weight += cluster.getNode(node).getMinEdgeWeight(kConfig);
  }

  // quick estimation of runtime
  procTime = num_blocks / total_weight;
  commTime = num_blocks / min_edge_weight;
  timeEst = procTime;
  if (0.0 < min_edge_weight)
    timeEst += commTime;

  if (kGPUOnly == kConfig) {
    perfBalanceGPU(cluster, decomp, timeEst);
  } else if (kStrongest == kConfig) {
    perfBalanceStrongestDevice(cluster, decomp);
  } else {
    // perform initial task distribution
    for (size_t i = 0; i < num_blocks; ++i)
      root.incrementBalCount();

    /*fprintf(stderr,
            "perfBalance: \n\ttime est: %f sec\n\tprocTime: %f sec\n\tcommTime: %f \
            sec\n\ttotal weight:%e \n\tmin edge weight:%e.\n",
            timeEst, procTime, commTime, total_weight, min_edge_weight);  // */

    do {
      changed = false;
      // balance the work between nodes and root
      for (unsigned int cpu_index = 1;
              cpu_index < cluster.getNumNodes();
              ++cpu_index) {
        Node& cpu_node = cluster.getNode(cpu_index);
        int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) -
                cpu_node.getBalCount();
        if (0 > work_deficit) { // node has extra work
          int extra_blocks = abs(work_deficit);
          for (int block_index = 0;
                  (block_index < extra_blocks) &&
                  (0 < cpu_node.getBalCount());
                  ++block_index) {
            // move block from child to parent
            cpu_node.decrementBalCount();
            root.incrementBalCount();
            changed = true;
          }
        } else if (0 < work_deficit) { //child needs more work
          work_request.setTimeDiff(timeEst - cpu_node.getBalTimeEst(0, kConfig));
          work_request.setIndex(cpu_index);
          work_queue.push(work_request);
        }
      }

      for (unsigned int cpu_index = 0;
              cpu_index < root.getNumChildren();
              ++cpu_index) {
        Node& cpu_node = root.getChild(cpu_index);
        int work_deficit = cpu_node.getTotalWorkNeeded(timeEst, kConfig) -
                cpu_node.getBalCount();
        if (0 > work_deficit) { // child has extra work
          int extra_blocks = abs(work_deficit);
          for (int block_index = 0;
                  (block_index < extra_blocks) && (0 < cpu_node.getBalCount());
                  ++block_index) {
            // move block from child to parent
            cpu_node.decrementBalCount();
            root.incrementBalCount();
            changed = true;
          }
        } else if (0 < work_deficit) { // child needs more work
          work_request.setTimeDiff(timeEst - cpu_node.getBalTimeEst(0, kConfig));
          work_request.setIndex(-1 * cpu_index); // hack so I know to give to one of root's children
          work_queue.push(work_request);
        }
      }
      /*
         at this point we have all extra blocks, and
         now we need to distribute blocks to children
         that need it
       */

      while (0 < root.getBalCount() && // there are blocks left to give
              !work_queue.empty()) { // there are requests left to fill

        // get largest request
        WorkRequest tmp = work_queue.top();
        work_queue.pop();

        double newTimeDiff = 0.0;
        int id = tmp.getIndex();
        if (id <= 0) { // local child
          id = -1 * id;
          root.decrementBalCount();
          root.getChild(id).incrementBalCount();
          newTimeDiff = timeEst - root.getChild(id).getBalTimeEst(0, kConfig);
          changed = true;
        } else { // request was from another node in cluster
          root.decrementBalCount();
          cluster.getNode(id).incrementBalCount();
          newTimeDiff = timeEst - cluster.getNode(id).getBalTimeEst(0, kConfig);
          changed = true;
        }
        // if there is still work left to do put it back on
        // the queue so that it will reorder correctly
        if (0 < newTimeDiff) {
          tmp.setTimeDiff(newTimeDiff);
          work_queue.push(tmp);
        }
      }

      // balance the work within each node
      for (unsigned int node = 0; node < cluster.getNumNodes(); ++node) {
        changed |= balanceNode(cluster.getNode(node), timeEst, kConfig);
      }
    } while (changed);
  }

  /* now that we know where everything should go, distribute the blocks */
  cluster.distributeBlocks(&decomp);

  /* the work is balanced, so we can fill the block directory */
  cluster.storeBlockLocs();
}