Example #1
// stage data to shared memory for exploration
void ASTTranslate::stageIterationToSharedMemoryExploration(SmallVector<Stmt *,
    16> &stageBody) {
  for (auto param : kernelDecl->parameters()) {
    if (KernelDeclMapShared[param]) {
      HipaccAccessor *Acc = KernelDeclMapAcc[param];

      Expr *global_offset_x = nullptr, *global_offset_y = nullptr;
      Expr *SX2;
      SmallVector<Stmt *, 16> stageIter;
      VarDecl *iter = createVarDecl(Ctx, kernelDecl, "_N", Ctx.IntTy,
          createIntegerLiteral(Ctx, 0));
      DeclStmt *iter_stmt = createDeclStmt(Ctx, iter);
      DeclRefExpr *iter_ref = createDeclRefExpr(Ctx, iter);


      if (Acc->getSizeX() > 1) {
        if (compilerOptions.exploreConfig()) {
          SX2 = tileVars.local_size_x;
        } else {
          SX2 = createIntegerLiteral(Ctx,
              static_cast<int32_t>(Kernel->getNumThreadsX()));
        }
      } else {
        SX2 = createIntegerLiteral(Ctx, 0);
      }
      global_offset_y = createBinaryOperator(Ctx, iter_ref,
          tileVars.local_size_y, BO_Mul, Ctx.IntTy);
      if (Acc->getSizeY() > 1) {
        global_offset_y = createBinaryOperator(Ctx, global_offset_y,
            createUnaryOperator(Ctx, createIntegerLiteral(Ctx,
                static_cast<int32_t>(Acc->getSizeY()/2)), UO_Minus, Ctx.IntTy),
            BO_Add, Ctx.IntTy);
      }
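      // e.g. (illustrative) blockDim.y = 4, SizeY = 5: global_offset_y = _N*4 - 2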

      // check if we need to stage right apron
      size_t num_stages_x = 0;
      if (Acc->getSizeX() > 1) {
        num_stages_x = 2;
      }

      // load row (line)
      for (size_t i=0; i<=num_stages_x; ++i) {
        // _smem[lidYRef + N*(int)blockDim.y]
        //      [(int)threadIdx.x + i*(int)blockDim.x] =
        //        Image[-SX/2 + i*(int)blockDim.x, -SY/2 + N*(int)blockDim.y];
        Expr *local_offset_x = nullptr;
        if (Acc->getSizeX() > 1) {
          local_offset_x = createBinaryOperator(Ctx, createIntegerLiteral(Ctx,
                static_cast<int32_t>(i)), tileVars.local_size_x, BO_Mul,
              Ctx.IntTy);
          global_offset_x = createBinaryOperator(Ctx, local_offset_x, SX2,
              BO_Sub, Ctx.IntTy);
        }

        stageLineToSharedMemory(param, stageIter, local_offset_x,
            createBinaryOperator(Ctx, iter_ref, tileVars.local_size_y, BO_Mul,
              Ctx.IntTy), global_offset_x, global_offset_y);
      }

      // PPT + (SY-2)/BSY + 1
      DeclRefExpr *DSY = createDeclRefExpr(Ctx, createVarDecl(Ctx, kernelDecl,
            "BSY_EXPLORE", Ctx.IntTy, nullptr));

      Expr *SY;
      if (Kernel->getPixelsPerThread() > 1) {
        SY = createIntegerLiteral(Ctx,
            static_cast<int32_t>(Kernel->getPixelsPerThread()));
      } else {
        SY = createIntegerLiteral(Ctx, 1);
      }

      if (Acc->getSizeY() > 1) {
        SY = createBinaryOperator(Ctx, SY, createBinaryOperator(Ctx,
              createBinaryOperator(Ctx, createIntegerLiteral(Ctx,
                  static_cast<int32_t>(Acc->getSizeY()-2)), DSY, BO_Div,
                Ctx.IntTy), createIntegerLiteral(Ctx, 1), BO_Add, Ctx.IntTy),
            BO_Add, Ctx.IntTy);
      }
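      // e.g. (illustrative) PPT = 4, SizeY = 5, BSY_EXPLORE = 4:
      //   SY = 4 + (5-2)/4 + 1 = 5 staging iterations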
      // for (int N=0; N < PPT + (SY-2)/BSY + 1; N++)
      ForStmt *stageLoop = createForStmt(Ctx, iter_stmt,
          createBinaryOperator(Ctx, iter_ref, SY, BO_LT, Ctx.BoolTy),
          createUnaryOperator(Ctx, iter_ref, UO_PostInc, Ctx.IntTy),
          createCompoundStmt(Ctx, stageIter));

      stageBody.push_back(stageLoop);
    }
  }
}
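To make the emitted index arithmetic concrete, the following standalone C++ sketch prints the local and global offsets that the generated staging loop covers. Block size, window size, and pixels per thread are illustrative assumptions, not values taken from a real kernel instance; N mirrors the generated "_N" counter.

#include <cstdio>

int main() {
  const int BSX = 32, BSY = 4;       // assumed blockDim.x / blockDim.y
  const int SIZE_X = 5, SIZE_Y = 5;  // assumed filter window size
  const int PPT = 4;                 // assumed pixels per thread

  // non-exploration case: SX2 = getNumThreadsX(); 0 when the window is 1 wide
  int SX2 = (SIZE_X > 1) ? BSX : 0;
  int num_stages_x = (SIZE_X > 1) ? 2 : 0;
  // loop bound built above: PPT + (SY-2)/BSY + 1
  int iters = (PPT > 1 ? PPT : 1) + ((SIZE_Y > 1) ? (SIZE_Y - 2) / BSY + 1 : 0);

  for (int N = 0; N < iters; ++N) {
    int global_offset_y = N * BSY + ((SIZE_Y > 1) ? -(SIZE_Y / 2) : 0);
    for (int i = 0; i <= num_stages_x; ++i) {
      int local_offset_x = i * BSX;
      int global_offset_x = local_offset_x - SX2;
      std::printf("N=%d i=%d: local_x=%d global_x=%d global_y=%d\n",
                  N, i, local_offset_x, global_offset_x, global_offset_y);
    }
  }
  return 0;
}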
Example #2
void HipaccKernel::calcConfig() {
  std::vector<std::pair<unsigned int, float> > occVec;
  unsigned int num_threads = max_threads_per_warp;
  bool use_shared = false;

  while (num_threads <= max_threads_per_block) {
    // allocations per thread block limits
    int warps_per_block = (int)ceil((float)num_threads /
        (float)max_threads_per_warp);
    int registers_per_block;
    if (isAMDGPU()) {
      // for AMD assume simple allocation strategy
      registers_per_block = warps_per_block * num_reg * max_threads_per_warp;
    } else {
      switch (allocation_granularity) {
        case BLOCK:
          // allocation in steps of two
          registers_per_block = (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size *
            num_reg * max_threads_per_warp;
          registers_per_block = (int)ceil((float)registers_per_block /
              (float)register_alloc_size) * register_alloc_size;
          break;
        case WARP:
          registers_per_block = (int)ceil((float)(num_reg *
                max_threads_per_warp) / (float)register_alloc_size) *
            register_alloc_size;
          registers_per_block *= (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size;
          break;
      }
    }
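    // e.g. (illustrative) BLOCK granularity, 128 threads (4 warps), 32 regs per
    // thread, warp_register_alloc_size = 2, register_alloc_size = 256:
    //   registers_per_block = ceil(4/2)*2 * 32 * 32 = 4096, rounded to 4096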

    unsigned int smem_used = 0;
    bool skip_config = false;
    // calculate shared memory usage for pixels staged to shared memory
    for (size_t i=0; i<KC->getNumImages(); ++i) {
      HipaccAccessor *Acc = getImgFromMapping(KC->getImgFields().data()[i]);
      if (useLocalMemory(Acc)) {
        // check if the configuration suits our assumptions about shared memory
        if (num_threads % 32 == 0) {
          // fixed shared memory for x: 3*BSX
          int size_x = 32;
          if (Acc->getSizeX() > 1) {
            size_x *= 3;
          }
          // add padding to avoid bank conflicts
          size_x += 1;

          // size_y = ceil((PPT*BSY + SY-1)/BSY) * BSY
          int threads_y = num_threads/32;
          int size_y = (int)ceilf((float)(getPixelsPerThread()*threads_y +
                Acc->getSizeY()-1)/(float)threads_y) * threads_y;
          smem_used += size_x*size_y * Acc->getImage()->getPixelSize();
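          // e.g. (illustrative) 128 threads, 5x5 window, PPT = 4, 4-byte pixels:
          //   size_x = 3*32 + 1 = 97, threads_y = 4,
          //   size_y = ceil((4*4 + 5-1)/4)*4 = 20, smem_used += 97*20*4 = 7760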
          use_shared = true;
        } else {
          skip_config = true;
        }
      }
    }

    if (skip_config || smem_used > max_total_shared_memory) {
      num_threads += max_threads_per_warp;
      continue;
    }

    int shared_memory_per_block = (int)ceil((float)smem_used /
        (float)shared_memory_alloc_size) * shared_memory_alloc_size;

    // maximum thread blocks per multiprocessor
    int lim_by_max_warps = std::min(max_blocks_per_multiprocessor, (unsigned
          int)floor((float)max_warps_per_multiprocessor /
            (float)warps_per_block));
    int lim_by_reg, lim_by_smem;
    if (num_reg > max_register_per_thread) {
      lim_by_reg = 0;
    } else {
      if (num_reg > 0) {
        lim_by_reg = (int)floor((float)max_total_registers /
            (float)registers_per_block);
      } else {
        lim_by_reg = max_blocks_per_multiprocessor;
      }
    }
    if (smem_used > 0) {
      lim_by_smem = (int)floor((float)max_total_shared_memory /
          (float)shared_memory_per_block);
    } else {
      lim_by_smem = max_blocks_per_multiprocessor;
    }

    // calculate GPU occupancy
    int active_thread_blocks_per_multiprocessor = std::min(std::min(lim_by_max_warps, lim_by_reg), lim_by_smem);
    if (active_thread_blocks_per_multiprocessor > 0) max_threads_for_kernel = num_threads;
    int active_warps_per_multiprocessor = active_thread_blocks_per_multiprocessor * warps_per_block;
    //int active_threads_per_multiprocessor = active_thread_blocks_per_multiprocessor * num_threads;
    float occupancy = (float)active_warps_per_multiprocessor/(float)max_warps_per_multiprocessor;
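    // e.g. (illustrative) 8 active blocks * 4 warps per block = 32 active warps
    // out of 48 max warps per multiprocessor -> occupancy = 0.67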
    //int max_simultaneous_blocks_per_GPU = active_thread_blocks_per_multiprocessor*max_multiprocessors_per_GPU;

    occVec.push_back(std::make_pair(num_threads, occupancy));
    num_threads += max_threads_per_warp;
  }

  // sort configurations according to occupancy and number of threads
  std::sort(occVec.begin(), occVec.end(), sortOccMap());

  // calculate (optimal) kernel configuration from the kernel window sizes and
  // ignore the limitation of maximal threads per block
  unsigned int num_threads_x_opt = max_threads_per_warp;
  unsigned int num_threads_y_opt = 1;
  while (num_threads_x_opt < max_size_x>>1)
    num_threads_x_opt += max_threads_per_warp;
  while (num_threads_y_opt*getPixelsPerThread() < max_size_y>>1)
    num_threads_y_opt += 1;
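  // e.g. (illustrative) warp size 32, max_size_x = 129, max_size_y = 9, PPT = 1:
  //   num_threads_x_opt = 64, num_threads_y_opt = 4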

  // Heuristic:
  // 0) maximize occupancy (e.g. to hide instruction latency)
  // 1) - minimize #threads for border handling (e.g. prefer y over x)
  //    - prefer x over y when no border handling is necessary
  llvm::errs() << "\nCalculating kernel configuration for " << kernelName << "\n";
  llvm::errs() << "\toptimal configuration: " << num_threads_x_opt << "x" <<
    num_threads_y_opt << "(x" << getPixelsPerThread() << ")\n";
  for (auto &occMap : occVec) {
    llvm::errs() << "\t" << occMap.first << " threads:\t" << occMap.second << "\t";

    if (use_shared) {
      // shared memory usage above was computed for 32 threads in x, so fix that here
      unsigned int num_threads_x = 32;
      unsigned int num_threads_y = occMap.first / num_threads_x;
      llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
    } else {
      // distinguish between configurations with and without border handling
      if (max_size_y > 1) {
        // start with warp_size or num_threads_x_opt if possible
        unsigned int num_threads_x = max_threads_per_warp;
        if (occMap.first >= num_threads_x_opt && occMap.first % num_threads_x_opt == 0) {
          num_threads_x = num_threads_x_opt;
        }
        unsigned int num_threads_y = occMap.first / num_threads_x;
        llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
      } else {
        // use all threads for x direction
        llvm::errs() << " -> " << occMap.first << "x1";
      }
    }
    llvm::errs() << "(x" << getPixelsPerThread() << ")\n";
  }


  // fall back to default or user specified configuration
  unsigned int num_blocks_bh_x, num_blocks_bh_y;
  if (occVec.empty() || options.useKernelConfig()) {
    setDefaultConfig();
    num_blocks_bh_x = max_size_x<=1 ? 0 :
        (unsigned int)ceil((float)(max_size_x>>1) / (float)num_threads_x);
    num_blocks_bh_y = max_size_y<=1 ? 0 :
        (unsigned int)ceil((float)(max_size_y>>1) /
            (float)(num_threads_y*getPixelsPerThread()));
    llvm::errs() << "Using default configuration " << num_threads_x << "x"
                 << num_threads_y << " for kernel '" << kernelName << "'\n";
  } else {
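The per-configuration occupancy estimate in the loop above can be illustrated with a small standalone C++ model. All device limits and per-block resources below are assumptions chosen for illustration, not values queried from a real target GPU.

#include <algorithm>
#include <cstdio>

int main() {
  // assumed device limits
  const unsigned warp_size = 32;
  const unsigned max_warps_per_mp = 48;
  const unsigned max_blocks_per_mp = 8;
  const unsigned max_total_registers = 32768;
  const unsigned max_total_shared_memory = 49152;
  // assumed resources for one candidate block size
  const unsigned num_threads = 128;
  const unsigned registers_per_block = 4096;
  const unsigned shared_memory_per_block = 7808;

  unsigned warps_per_block = (num_threads + warp_size - 1) / warp_size;
  unsigned lim_by_warps = std::min(max_blocks_per_mp,
                                   max_warps_per_mp / warps_per_block);
  unsigned lim_by_reg = registers_per_block ?
      max_total_registers / registers_per_block : max_blocks_per_mp;
  unsigned lim_by_smem = shared_memory_per_block ?
      max_total_shared_memory / shared_memory_per_block : max_blocks_per_mp;

  // active blocks per multiprocessor is the tightest of the three limits
  unsigned active_blocks = std::min({lim_by_warps, lim_by_reg, lim_by_smem});
  float occupancy = (float)(active_blocks * warps_per_block) / max_warps_per_mp;
  std::printf("%u threads: %u active blocks, occupancy %.2f\n",
              num_threads, active_blocks, occupancy);
  return 0;
}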
Example #3
// stage iteration p to shared memory
void ASTTranslate::stageIterationToSharedMemory(SmallVector<Stmt *, 16>
    &stageBody, int p) {
  for (auto param : kernelDecl->parameters()) {
    if (KernelDeclMapShared[param]) {
      HipaccAccessor *Acc = KernelDeclMapAcc[param];

      // check if the bottom apron has to be fetched
      if (p>=static_cast<int>(Kernel->getPixelsPerThread())) {
        int p_add = static_cast<int>(ceilf((Acc->getSizeY()-1) /
              static_cast<float>(Kernel->getNumThreadsY())));
        if (p>=static_cast<int>(Kernel->getPixelsPerThread())+p_add) continue;
      }
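      // e.g. (illustrative) PPT = 4, SizeY = 5, NumThreadsY = 4:
      //   p_add = ceil(4/4) = 1, so only p == 4 still stages the bottom apron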

      Expr *global_offset_x = nullptr, *global_offset_y = nullptr;
      Expr *SX2;

      if (Acc->getSizeX() > 1) {
        if (compilerOptions.exploreConfig()) {
          SX2 = tileVars.local_size_x;
        } else {
          SX2 = createIntegerLiteral(Ctx,
              static_cast<int32_t>(Kernel->getNumThreadsX()));
        }
      } else {
        SX2 = createIntegerLiteral(Ctx, 0);
      }
      if (Acc->getSizeY() > 1) {
        global_offset_y = createParenExpr(Ctx, createUnaryOperator(Ctx,
              createIntegerLiteral(Ctx,
                static_cast<int32_t>(Acc->getSizeY()/2)), UO_Minus, Ctx.IntTy));
      } else {
        global_offset_y = nullptr;
      }

      if (compilerOptions.allowMisAlignedAccess()) {
        Expr *local_offset_x = nullptr;
        // load first half line
        if (Acc->getSizeX() > 1) {
          local_offset_x = createIntegerLiteral(Ctx, static_cast<int32_t>(0));
          global_offset_x = createParenExpr(Ctx, createUnaryOperator(Ctx,
                createIntegerLiteral(Ctx,
                  static_cast<int32_t>(Acc->getSizeX()/2)), UO_Minus, Ctx.IntTy));
        }
        stageLineToSharedMemory(param, stageBody, local_offset_x, nullptr,
            global_offset_x, global_offset_y);
        // load second half of the line (partially overlapping the first)
        if (Acc->getSizeX() > 1) {
          local_offset_x = createIntegerLiteral(Ctx,
              static_cast<int32_t>(Acc->getSizeX()/2)*2);
          global_offset_x = createParenExpr(Ctx, createUnaryOperator(Ctx,
                createIntegerLiteral(Ctx,
                  static_cast<int32_t>(Acc->getSizeX()/2)), UO_Plus, Ctx.IntTy));
        }
        stageLineToSharedMemory(param, stageBody, local_offset_x, nullptr,
            global_offset_x, global_offset_y);
      } else {
        // check if we need to stage right apron
        size_t num_stages_x = 0;
        if (Acc->getSizeX() > 1) {
          num_stages_x = 2;
        }

        // load row (line)
        for (size_t i=0; i<=num_stages_x; ++i) {
          // _smem[lidYRef][(int)threadIdx.x + i*(int)blockDim.x] =
          //        Image[-SX/2 + i*(int)blockDim.x, -SY/2];
          Expr *local_offset_x = nullptr;
          if (Acc->getSizeX() > 1) {
            local_offset_x = createBinaryOperator(Ctx, createIntegerLiteral(Ctx,
                  static_cast<int32_t>(i)), tileVars.local_size_x, BO_Mul,
                Ctx.IntTy);
            global_offset_x = createBinaryOperator(Ctx, local_offset_x, SX2,
                BO_Sub, Ctx.IntTy);
          }

          stageLineToSharedMemory(param, stageBody, local_offset_x, nullptr,
              global_offset_x, global_offset_y);
        }
      }
    }
  }
}
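For comparison, here is a standalone C++ sketch of the x offsets produced by the two code paths above: with allowMisAlignedAccess() the line is fetched as two partially overlapping halves, otherwise as up to three block-aligned stages. The window size and block width are illustrative assumptions.

#include <cstdio>

int main() {
  const int BSX = 32;     // assumed blockDim.x (getNumThreadsX())
  const int SIZE_X = 5;   // assumed filter width (> 1, so an apron is needed)

  std::printf("misaligned-access path:\n");
  // first half line, then the partially overlapping second half
  std::printf("  local_x=%d global_x=%d\n", 0, -(SIZE_X / 2));
  std::printf("  local_x=%d global_x=%d\n", (SIZE_X / 2) * 2, +(SIZE_X / 2));

  std::printf("aligned path (three stages):\n");
  int SX2 = BSX;          // non-exploration case: getNumThreadsX()
  for (int i = 0; i <= 2; ++i)
    std::printf("  local_x=%d global_x=%d\n", i * BSX, i * BSX - SX2);
  return 0;
}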