Example #1
// stage single image line (warp size) to shared memory
void ASTTranslate::stageLineToSharedMemory(ParmVarDecl *PVD,
    SmallVector<Stmt *, 16> &stageBody, Expr *local_offset_x, Expr
    *local_offset_y, Expr *global_offset_x, Expr *global_offset_y) {
  VarDecl *VD = KernelDeclMapShared[PVD];
  HipaccAccessor *Acc = KernelDeclMapAcc[PVD];
  DeclRefExpr *paramDRE = createDeclRefExpr(Ctx, PVD);

  Expr *LHS = accessMemShared(createDeclRefExpr(Ctx, VD), local_offset_x,
      local_offset_y);

  Expr *RHS;
  if (bh_variant.borderVal) {
    SmallVector<Stmt *, 16> bhStmts;
    SmallVector<CompoundStmt *, 16> bhCStmt;
    RHS = addBorderHandling(paramDRE, global_offset_x, global_offset_y, Acc,
        bhStmts, bhCStmt);

    // add border handling statements to stageBody
    for (auto stmt : bhStmts)
      stageBody.push_back(stmt);
  } else {
    RHS = accessMem(paramDRE, Acc, READ_ONLY, global_offset_x, global_offset_y);
  }

  stageBody.push_back(createBinaryOperator(Ctx, LHS, RHS, BO_Assign,
        Acc->getImage()->getType()));
}
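
For reference, the statements that stageLineToSharedMemory() appends to stageBody correspond roughly to CUDA code of the following shape. This is only a hand-written sketch with assumed names (smem, input, stride, BSX, BSY); Hipacc's actual generated identifiers differ, and the clamping below merely stands in for whatever access addBorderHandling() produces for the selected boundary mode.

// sketch: staging one warp-wide line of the input image into shared memory
#define BSX 32                              // block size in x (one warp)
#define BSY 4                               // block size in y
__global__ void stage_sketch(const int *input, int width, int height,
                             int stride) {
  __shared__ int smem[BSY][BSX + 1];        // +1 column padding (bank conflicts)
  int lx = threadIdx.x, ly = threadIdx.y;
  int gx = blockIdx.x * blockDim.x + lx;
  int gy = blockIdx.y * blockDim.y + ly;
  // border handling variant: clamp global coordinates to the image extent;
  // without border handling this is a plain global memory load (accessMem)
  int cgx = min(max(gx, 0), width  - 1);
  int cgy = min(max(gy, 0), height - 1);
  // LHS built by accessMemShared(), RHS by addBorderHandling()/accessMem()
  smem[ly][lx] = input[cgy * stride + cgx];
  __syncthreads();
  // ... the actual kernel body then operates on smem ...
}
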
Example #2
void HipaccKernel::calcConfig() {
  std::vector<std::pair<unsigned int, float> > occVec;
  unsigned int num_threads = max_threads_per_warp;
  bool use_shared = false;

  while (num_threads <= max_threads_per_block) {
    // allocations per thread block limits
    int warps_per_block = (int)ceil((float)num_threads /
        (float)max_threads_per_warp);
    int registers_per_block = 0;
    if (isAMDGPU()) {
      // for AMD assume simple allocation strategy
      registers_per_block = warps_per_block * num_reg * max_threads_per_warp;
    } else {
      switch (allocation_granularity) {
        case BLOCK:
          // round warps up to a multiple of warp_register_alloc_size
          registers_per_block = (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size *
            num_reg * max_threads_per_warp;
          registers_per_block = (int)ceil((float)registers_per_block /
              (float)register_alloc_size) * register_alloc_size;
          break;
        case WARP:
          registers_per_block = (int)ceil((float)(num_reg *
                max_threads_per_warp) / (float)register_alloc_size) *
            register_alloc_size;
          registers_per_block *= (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size;
          break;
      }
    }
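    // worked example with assumed device values (warp_register_alloc_size = 2,
    // register_alloc_size = 256): for num_reg = 20 and num_threads = 128
    // (i.e. 4 warps),
    //   BLOCK granularity: ceil(4/2)*2 * 20 * 32 = 2560 registers per block
    //   WARP granularity:  ceil(20*32/256)*256 = 768 registers per warp,
    //                      times 4 warps = 3072 registers per block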

    unsigned int smem_used = 0;
    bool skip_config = false;
    // calculate shared memory usage for pixels staged to shared memory
    for (size_t i=0; i<KC->getNumImages(); ++i) {
      HipaccAccessor *Acc = getImgFromMapping(KC->getImgFields().data()[i]);
      if (useLocalMemory(Acc)) {
        // check if the configuration suits our assumptions about shared memory
        if (num_threads % 32 == 0) {
          // fixed shared memory for x: 3*BSX
          int size_x = 32;
          if (Acc->getSizeX() > 1) {
            size_x *= 3;
          }
          // add padding to avoid bank conflicts
          size_x += 1;

          // size_y = ceil((PPT*BSY+SX-1)/BSY)
          int threads_y = num_threads/32;
          int size_y = (int)ceilf((float)(getPixelsPerThread()*threads_y +
                Acc->getSizeY()-1)/(float)threads_y) * threads_y;
          smem_used += size_x*size_y * Acc->getImage()->getPixelSize();
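          // worked example with assumed values: num_threads = 128 (threads_y = 4),
          // getPixelsPerThread() = 1, a 5x5 mask (SizeX = SizeY = 5) and
          // 4-byte pixels:
          //   size_x = 32*3 + 1 = 97
          //   size_y = ceil((1*4 + 5-1)/4) * 4 = 8
          //   -> 97 * 8 * 4 = 3104 bytes of shared memory for this image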
          use_shared = true;
        } else {
          skip_config = true;
        }
      }
    }

    if (skip_config || smem_used > max_total_shared_memory) {
      num_threads += max_threads_per_warp;
      continue;
    }

    int shared_memory_per_block = (int)ceil((float)smem_used /
        (float)shared_memory_alloc_size) * shared_memory_alloc_size;

    // maximum thread blocks per multiprocessor
    int lim_by_max_warps = std::min(max_blocks_per_multiprocessor,
        (unsigned int)floor((float)max_warps_per_multiprocessor /
                            (float)warps_per_block));
    int lim_by_reg, lim_by_smem;
    if (num_reg > max_register_per_thread) {
      lim_by_reg = 0;
    } else {
      if (num_reg > 0) {
        lim_by_reg = (int)floor((float)max_total_registers /
            (float)registers_per_block);
      } else {
        lim_by_reg = max_blocks_per_multiprocessor;
      }
    }
    if (smem_used > 0) {
      lim_by_smem = (int)floor((float)max_total_shared_memory /
          (float)shared_memory_per_block);
    } else {
      lim_by_smem = max_blocks_per_multiprocessor;
    }

    // calculate GPU occupancy
    int active_thread_blocks_per_multiprocessor =
        std::min(std::min(lim_by_max_warps, lim_by_reg), lim_by_smem);
    if (active_thread_blocks_per_multiprocessor > 0)
      max_threads_for_kernel = num_threads;
    int active_warps_per_multiprocessor =
        active_thread_blocks_per_multiprocessor * warps_per_block;
    //int active_threads_per_multiprocessor =
    //    active_thread_blocks_per_multiprocessor * num_threads;
    float occupancy = (float)active_warps_per_multiprocessor /
        (float)max_warps_per_multiprocessor;
    //int max_simultaneous_blocks_per_GPU = active_thread_blocks_per_multiprocessor*max_multiprocessors_per_GPU;
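    // worked example with assumed device limits (max_warps_per_multiprocessor = 48,
    // max_blocks_per_multiprocessor = 8): for warps_per_block = 4 and no register
    // or shared memory limit, 8 blocks * 4 warps = 32 active warps,
    // i.e. an occupancy of 32/48 = 0.67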

    occVec.push_back(std::make_pair(num_threads, occupancy));
    num_threads += max_threads_per_warp;
  }

  // sort configurations according to occupancy and number of threads
  std::sort(occVec.begin(), occVec.end(), sortOccMap());

  // calculate (optimal) kernel configuration from the kernel window sizes and
  // ignore the limitation of maximal threads per block
  unsigned int num_threads_x_opt = max_threads_per_warp;
  unsigned int num_threads_y_opt = 1;
  while (num_threads_x_opt < (max_size_x >> 1))
    num_threads_x_opt += max_threads_per_warp;
  while (num_threads_y_opt*getPixelsPerThread() < (max_size_y >> 1))
    num_threads_y_opt += 1;
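  // worked example with assumed values: for a 5x5 kernel window
  // (max_size_x = max_size_y = 5) and getPixelsPerThread() = 1, half the window
  // is 2 in each direction, so num_threads_x_opt stays at the warp size (32)
  // and num_threads_y_opt becomes 2, i.e. an optimal configuration of 32x2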

  // Heuristic:
  // 0) maximize occupancy (e.g., to hide instruction latency)
  // 1) - minimize #threads for border handling (e.g., prefer y over x)
  //    - prefer x over y when no border handling is necessary
  llvm::errs() << "\nCalculating kernel configuration for " << kernelName << "\n";
  llvm::errs() << "\toptimal configuration: " << num_threads_x_opt << "x" <<
    num_threads_y_opt << "(x" << getPixelsPerThread() << ")\n";
  for (auto &occMap : occVec) {
    llvm::errs() << "\t" << occMap.first << " threads:\t" << occMap.second << "\t";

    if (use_shared) {
      // with shared memory, fix the x dimension to the warp size
      unsigned int num_threads_x = 32;
      unsigned int num_threads_y = occMap.first / num_threads_x;
      llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
    } else {
      // distinguish between configurations with and without border handling
      if (max_size_y > 1) {
        // start with warp_size or num_threads_x_opt if possible
        unsigned int num_threads_x = max_threads_per_warp;
        if (occMap.first >= num_threads_x_opt && occMap.first % num_threads_x_opt == 0) {
          num_threads_x = num_threads_x_opt;
        }
        unsigned int num_threads_y = occMap.first / num_threads_x;
        llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
      } else {
        // use all threads for x direction
        llvm::errs() << " -> " << occMap.first << "x1";
      }
    }
    llvm::errs() << "(x" << getPixelsPerThread() << ")\n";
  }


  // fall back to default or user specified configuration
  unsigned int num_blocks_bh_x, num_blocks_bh_y;
  if (occVec.empty() || options.useKernelConfig()) {
    setDefaultConfig();
    num_blocks_bh_x = max_size_x<=1 ? 0 :
        (unsigned int)ceil((float)(max_size_x>>1) / (float)num_threads_x);
    num_blocks_bh_y = max_size_y<=1 ? 0 :
        (unsigned int)ceil((float)(max_size_y>>1) /
                           (float)(num_threads_y*getPixelsPerThread()));
    llvm::errs() << "Using default configuration " << num_threads_x << "x"
                 << num_threads_y << " for kernel '" << kernelName << "'\n";
  } else {