// stage data to shared memory for exploration
void ASTTranslate::stageIterationToSharedMemoryExploration(SmallVector<Stmt *, 16> &stageBody) {
  for (auto param : kernelDecl->parameters()) {
    if (KernelDeclMapShared[param]) {
      HipaccAccessor *Acc = KernelDeclMapAcc[param];
      Expr *global_offset_x = nullptr, *global_offset_y = nullptr;
      Expr *SX2;
      SmallVector<Stmt *, 16> stageIter;
      VarDecl *iter = createVarDecl(Ctx, kernelDecl, "_N", Ctx.IntTy,
          createIntegerLiteral(Ctx, 0));
      DeclStmt *iter_stmt = createDeclStmt(Ctx, iter);
      DeclRefExpr *iter_ref = createDeclRefExpr(Ctx, iter);

      if (Acc->getSizeX() > 1) {
        if (compilerOptions.exploreConfig()) {
          SX2 = tileVars.local_size_x;
        } else {
          SX2 = createIntegerLiteral(Ctx,
              static_cast<int32_t>(Kernel->getNumThreadsX()));
        }
      } else {
        SX2 = createIntegerLiteral(Ctx, 0);
      }

      global_offset_y = createBinaryOperator(Ctx, iter_ref,
          tileVars.local_size_y, BO_Mul, Ctx.IntTy);
      if (Acc->getSizeY() > 1) {
        global_offset_y = createBinaryOperator(Ctx, global_offset_y,
            createUnaryOperator(Ctx, createIntegerLiteral(Ctx,
                static_cast<int32_t>(Acc->getSizeY()/2)), UO_Minus, Ctx.IntTy),
            BO_Add, Ctx.IntTy);
      }

      // check if we need to stage right apron
      size_t num_stages_x = 0;
      if (Acc->getSizeX() > 1) {
        num_stages_x = 2;
      }

      // load row (line)
      for (size_t i=0; i<=num_stages_x; ++i) {
        // _smem[lidYRef + N*(int)blockDim.y]
        //      [(int)threadIdx.x + i*(int)blockDim.x] =
        //   Image[-SX/2 + N*(int)blockDim.y + i*(int)blockDim.x, -SY/2];
        Expr *local_offset_x = nullptr;
        if (Acc->getSizeX() > 1) {
          local_offset_x = createBinaryOperator(Ctx, createIntegerLiteral(Ctx,
                static_cast<int32_t>(i)), tileVars.local_size_x, BO_Mul,
              Ctx.IntTy);
          global_offset_x = createBinaryOperator(Ctx, local_offset_x, SX2,
              BO_Sub, Ctx.IntTy);
        }

        stageLineToSharedMemory(param, stageIter, local_offset_x,
            createBinaryOperator(Ctx, iter_ref, tileVars.local_size_y, BO_Mul,
              Ctx.IntTy), global_offset_x, global_offset_y);
      }

      // PPT + (SY-2)/BSY + 1
      DeclRefExpr *DSY = createDeclRefExpr(Ctx, createVarDecl(Ctx, kernelDecl,
            "BSY_EXPLORE", Ctx.IntTy, nullptr));

      Expr *SY;
      if (Kernel->getPixelsPerThread() > 1) {
        SY = createIntegerLiteral(Ctx,
            static_cast<int32_t>(Kernel->getPixelsPerThread()));
      } else {
        SY = createIntegerLiteral(Ctx, 1);
      }

      if (Acc->getSizeY() > 1) {
        SY = createBinaryOperator(Ctx, SY, createBinaryOperator(Ctx,
              createBinaryOperator(Ctx, createIntegerLiteral(Ctx,
                  static_cast<int32_t>(Acc->getSizeY()-2)), DSY, BO_Div,
                Ctx.IntTy), createIntegerLiteral(Ctx, 1), BO_Add, Ctx.IntTy),
            BO_Add, Ctx.IntTy);
      }

      // for (int N=0; N < PPT + (SY-2)/BSY + 1; ++N)
      ForStmt *stageLoop = createForStmt(Ctx, iter_stmt,
          createBinaryOperator(Ctx, iter_ref, SY, BO_LT, Ctx.BoolTy),
          createUnaryOperator(Ctx, iter_ref, UO_PostInc, Ctx.IntTy),
          createCompoundStmt(Ctx, stageIter));

      stageBody.push_back(stageLoop);
    }
  }
}
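
// --- Illustrative sketch (not part of the translator) ----------------------
// The loop built above stages one image line per iteration _N into shared
// memory; the trip count of the generated loop corresponds to
//   PPT + (SY-2)/BSY + 1
// where PPT is pixels per thread, BSY the block height (BSY_EXPLORE at
// exploration time), and SY the vertical filter window size. The standalone
// sketch below uses assumed parameter names and simply evaluates that count
// so the arithmetic can be checked in isolation.
#include <cstdio>

// assumed helper: mirrors the SY expression constructed with
// createBinaryOperator() above (PPT, or 1, plus the vertical apron rows)
static int stagingIterations(int ppt, int bsy, int filter_size_y) {
  int iters = (ppt > 1) ? ppt : 1;            // one staged line per output row
  if (filter_size_y > 1)
    iters += (filter_size_y - 2) / bsy + 1;   // extra lines for the y apron
  return iters;
}

int main() {
  // e.g. 4 pixels per thread, block height 8, a 5x5 filter window
  std::printf("staging iterations: %d\n", stagingIterations(4, 8, 5));
  return 0;
}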
void HipaccKernel::calcConfig() {
  std::vector<std::pair<unsigned int, float> > occVec;
  unsigned int num_threads = max_threads_per_warp;
  bool use_shared = false;

  while (num_threads <= max_threads_per_block) {
    // allocations per thread block limits
    int warps_per_block = (int)ceil((float)num_threads /
        (float)max_threads_per_warp);
    int registers_per_block;

    if (isAMDGPU()) {
      // for AMD assume simple allocation strategy
      registers_per_block = warps_per_block * num_reg * max_threads_per_warp;
    } else {
      switch (allocation_granularity) {
        case BLOCK:
          // allocation in steps of two
          registers_per_block = (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size *
            num_reg * max_threads_per_warp;
          registers_per_block = (int)ceil((float)registers_per_block /
              (float)register_alloc_size) * register_alloc_size;
          break;
        case WARP:
          registers_per_block = (int)ceil((float)(num_reg *
                max_threads_per_warp) / (float)register_alloc_size) *
            register_alloc_size;
          registers_per_block *= (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size;
          break;
      }
    }

    unsigned int smem_used = 0;
    bool skip_config = false;
    // calculate shared memory usage for pixels staged to shared memory
    for (size_t i=0; i<KC->getNumImages(); ++i) {
      HipaccAccessor *Acc = getImgFromMapping(KC->getImgFields().data()[i]);
      if (useLocalMemory(Acc)) {
        // check if the configuration suits our assumptions about shared memory
        if (num_threads % 32 == 0) {
          // fixed shared memory for x: 3*BSX
          int size_x = 32;
          if (Acc->getSizeX() > 1) {
            size_x *= 3;
          }
          // add padding to avoid bank conflicts
          size_x += 1;

          // size_y = ceil((PPT*BSY+SY-1)/BSY) * BSY
          int threads_y = num_threads/32;
          int size_y = (int)ceilf((float)(getPixelsPerThread()*threads_y +
                Acc->getSizeY()-1)/(float)threads_y) * threads_y;

          smem_used += size_x*size_y * Acc->getImage()->getPixelSize();
          use_shared = true;
        } else {
          skip_config = true;
        }
      }
    }

    if (skip_config || smem_used > max_total_shared_memory) {
      num_threads += max_threads_per_warp;
      continue;
    }

    int shared_memory_per_block = (int)ceil((float)smem_used /
        (float)shared_memory_alloc_size) * shared_memory_alloc_size;

    // maximum thread blocks per multiprocessor
    int lim_by_max_warps = std::min(max_blocks_per_multiprocessor,
        (unsigned int)floor((float)max_warps_per_multiprocessor /
          (float)warps_per_block));
    int lim_by_reg, lim_by_smem;

    if (num_reg > max_register_per_thread) {
      lim_by_reg = 0;
    } else {
      if (num_reg > 0) {
        lim_by_reg = (int)floor((float)max_total_registers /
            (float)registers_per_block);
      } else {
        lim_by_reg = max_blocks_per_multiprocessor;
      }
    }

    if (smem_used > 0) {
      lim_by_smem = (int)floor((float)max_total_shared_memory /
          (float)shared_memory_per_block);
    } else {
      lim_by_smem = max_blocks_per_multiprocessor;
    }

    // calculate GPU occupancy
    int active_thread_blocks_per_multiprocessor =
      std::min(std::min(lim_by_max_warps, lim_by_reg), lim_by_smem);
    if (active_thread_blocks_per_multiprocessor > 0)
      max_threads_for_kernel = num_threads;
    int active_warps_per_multiprocessor =
      active_thread_blocks_per_multiprocessor * warps_per_block;
    //int active_threads_per_multiprocessor = active_thread_blocks_per_multiprocessor * num_threads;
    float occupancy = (float)active_warps_per_multiprocessor /
      (float)max_warps_per_multiprocessor;
    //int max_simultaneous_blocks_per_GPU = active_thread_blocks_per_multiprocessor*max_multiprocessors_per_GPU;

    occVec.push_back(std::pair<unsigned int, float>(num_threads, occupancy));
    num_threads += max_threads_per_warp;
  }
  // sort configurations according to occupancy and number of threads
  std::sort(occVec.begin(), occVec.end(), sortOccMap());

  // calculate (optimal) kernel configuration from the kernel window sizes and
  // ignore the limitation of maximal threads per block
  unsigned int num_threads_x_opt = max_threads_per_warp;
  unsigned int num_threads_y_opt = 1;
  while (num_threads_x_opt < max_size_x>>1)
    num_threads_x_opt += max_threads_per_warp;
  while (num_threads_y_opt*getPixelsPerThread() < max_size_y>>1)
    num_threads_y_opt += 1;

  // Heuristic:
  // 0) maximize occupancy (e.g. to hide instruction latency)
  // 1) - minimize #threads for border handling (e.g. prefer y over x)
  //    - prefer x over y when no border handling is necessary
  llvm::errs() << "\nCalculating kernel configuration for " << kernelName << "\n";
  llvm::errs() << "\toptimal configuration: " << num_threads_x_opt << "x"
               << num_threads_y_opt << "(x" << getPixelsPerThread() << ")\n";

  for (auto iter=occVec.begin(); iter<occVec.end(); ++iter) {
    std::pair<unsigned int, float> occMap = *iter;
    llvm::errs() << "\t" << occMap.first << " threads:\t" << occMap.second << "\t";

    if (use_shared) {
      // start with warp_size or num_threads_x_opt if possible
      unsigned int num_threads_x = 32;
      unsigned int num_threads_y = occMap.first / num_threads_x;
      llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
    } else {
      // distinguish whether we create border handling or not
      if (max_size_y > 1) {
        // start with warp_size or num_threads_x_opt if possible
        unsigned int num_threads_x = max_threads_per_warp;
        if (occMap.first >= num_threads_x_opt &&
            occMap.first % num_threads_x_opt == 0) {
          num_threads_x = num_threads_x_opt;
        }
        unsigned int num_threads_y = occMap.first / num_threads_x;
        llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
      } else {
        // use all threads for the x direction
        llvm::errs() << " -> " << occMap.first << "x1";
      }
    }
    llvm::errs() << "(x" << getPixelsPerThread() << ")\n";
  }

  // fall back to the default or user-specified configuration
  unsigned int num_blocks_bh_x, num_blocks_bh_y;
  if (occVec.empty() || options.useKernelConfig()) {
    setDefaultConfig();
    num_blocks_bh_x = max_size_x<=1 ? 0 :
      (unsigned int)ceil((float)(max_size_x>>1) / (float)num_threads_x);
    num_blocks_bh_y = max_size_y<=1 ? 0 :
      (unsigned int)ceil((float)(max_size_y>>1) /
          (float)(num_threads_y*getPixelsPerThread()));
    llvm::errs() << "Using default configuration " << num_threads_x << "x"
                 << num_threads_y << " for kernel '" << kernelName << "'\n";
  } else {
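
// --- Illustrative sketch (not part of the translator) ----------------------
// calcConfig() above rates each candidate block size by the occupancy it
// would reach: the number of resident blocks per multiprocessor is the
// minimum of the limits imposed by warp slots, registers, and shared memory,
// and occupancy is resident warps over the warp-slot maximum. The standalone
// sketch below replays that arithmetic with assumed device limits (the real
// values come from the target device tables) and ignores the per-architecture
// register allocation granularity handled above.
#include <algorithm>
#include <cstdio>

struct DeviceLimits {
  int max_warps_per_mp;    // warp slots per multiprocessor
  int max_blocks_per_mp;   // resident block limit
  int max_registers;       // register file size per multiprocessor
  int max_shared_memory;   // shared memory per multiprocessor (bytes)
};

static float estimateOccupancy(const DeviceLimits &dev, int warps_per_block,
                               int regs_per_block, int smem_per_block) {
  int lim_warps = std::min(dev.max_blocks_per_mp,
                           dev.max_warps_per_mp / warps_per_block);
  int lim_regs  = regs_per_block > 0 ? dev.max_registers / regs_per_block
                                     : dev.max_blocks_per_mp;
  int lim_smem  = smem_per_block > 0 ? dev.max_shared_memory / smem_per_block
                                     : dev.max_blocks_per_mp;
  int blocks = std::min(std::min(lim_warps, lim_regs), lim_smem);
  return (float)(blocks * warps_per_block) / (float)dev.max_warps_per_mp;
}

int main() {
  DeviceLimits dev = {64, 16, 65536, 49152};   // assumed example limits
  // e.g. 256 threads -> 8 warps, 32 regs/thread -> 8192 regs, 12 KiB smem
  std::printf("occupancy: %.2f\n", estimateOccupancy(dev, 8, 8192, 12288));
  return 0;
}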
// stage iteration p to shared memory
void ASTTranslate::stageIterationToSharedMemory(SmallVector<Stmt *, 16> &stageBody, int p) {
  for (auto param : kernelDecl->parameters()) {
    if (KernelDeclMapShared[param]) {
      HipaccAccessor *Acc = KernelDeclMapAcc[param];

      // check if the bottom apron has to be fetched
      if (p>=static_cast<int>(Kernel->getPixelsPerThread())) {
        int p_add = static_cast<int>(ceilf((Acc->getSizeY()-1) /
              static_cast<float>(Kernel->getNumThreadsY())));
        if (p>=static_cast<int>(Kernel->getPixelsPerThread())+p_add)
          continue;
      }

      Expr *global_offset_x = nullptr, *global_offset_y = nullptr;
      Expr *SX2;

      if (Acc->getSizeX() > 1) {
        if (compilerOptions.exploreConfig()) {
          SX2 = tileVars.local_size_x;
        } else {
          SX2 = createIntegerLiteral(Ctx,
              static_cast<int32_t>(Kernel->getNumThreadsX()));
        }
      } else {
        SX2 = createIntegerLiteral(Ctx, 0);
      }
      if (Acc->getSizeY() > 1) {
        global_offset_y = createParenExpr(Ctx, createUnaryOperator(Ctx,
              createIntegerLiteral(Ctx,
                static_cast<int32_t>(Acc->getSizeY()/2)), UO_Minus,
              Ctx.IntTy));
      } else {
        global_offset_y = nullptr;
      }

      if (compilerOptions.allowMisAlignedAccess()) {
        Expr *local_offset_x = nullptr;

        // load first half line
        if (Acc->getSizeX() > 1) {
          local_offset_x = createIntegerLiteral(Ctx, static_cast<int32_t>(0));
          global_offset_x = createParenExpr(Ctx, createUnaryOperator(Ctx,
                createIntegerLiteral(Ctx,
                  static_cast<int32_t>(Acc->getSizeX()/2)), UO_Minus,
                Ctx.IntTy));
        }
        stageLineToSharedMemory(param, stageBody, local_offset_x, nullptr,
            global_offset_x, global_offset_y);

        // load line second half (partially overlap)
        if (Acc->getSizeX() > 1) {
          local_offset_x = createIntegerLiteral(Ctx,
              static_cast<int32_t>(Acc->getSizeX()/2)*2);
          global_offset_x = createParenExpr(Ctx, createUnaryOperator(Ctx,
                createIntegerLiteral(Ctx,
                  static_cast<int32_t>(Acc->getSizeX()/2)), UO_Plus,
                Ctx.IntTy));
        }
        stageLineToSharedMemory(param, stageBody, local_offset_x, nullptr,
            global_offset_x, global_offset_y);
      } else {
        // check if we need to stage right apron
        size_t num_stages_x = 0;
        if (Acc->getSizeX() > 1) {
          num_stages_x = 2;
        }

        // load row (line)
        for (size_t i=0; i<=num_stages_x; ++i) {
          // _smem[lidYRef][(int)threadIdx.x + i*(int)blockDim.x] =
          //   Image[-SX/2 + i*(int)blockDim.x, -SY/2];
          Expr *local_offset_x = nullptr;
          if (Acc->getSizeX() > 1) {
            local_offset_x = createBinaryOperator(Ctx, createIntegerLiteral(Ctx,
                  static_cast<int32_t>(i)), tileVars.local_size_x, BO_Mul,
                Ctx.IntTy);
            global_offset_x = createBinaryOperator(Ctx, local_offset_x, SX2,
                BO_Sub, Ctx.IntTy);
          }

          stageLineToSharedMemory(param, stageBody, local_offset_x, nullptr,
              global_offset_x, global_offset_y);
        }
      }
    }
  }
}
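
// --- Illustrative sketch (not part of the translator) ----------------------
// stageIterationToSharedMemory() above emits either two staging steps per
// line (the allowMisAlignedAccess() branch: first half at global offset
// -SX/2, second half at +SX/2) or up to three block-wide steps whose global
// offsets are i*BSX - SX2. The standalone sketch below, with assumed names,
// prints the (local_offset_x, global_offset_x) pairs each strategy generates
// so the addressing can be compared at a glance.
#include <cstdio>

static void printMisalignedStages(int filter_size_x) {
  // two partially overlapping halves, as in the allowMisAlignedAccess() path
  int half = (filter_size_x > 1) ? filter_size_x / 2 : 0;
  std::printf("misaligned: local %d, global %d\n", 0, -half);
  std::printf("misaligned: local %d, global %d\n", half * 2, +half);
}

static void printAlignedStages(int filter_size_x, int block_size_x) {
  if (filter_size_x <= 1) {
    // no horizontal apron: a single stage with no extra offsets
    std::printf("aligned:    local %d, global %d\n", 0, 0);
    return;
  }
  // three block-wide columns: left apron, center, right apron
  for (int i = 0; i <= 2; ++i) {
    int local_offset_x  = i * block_size_x;
    int global_offset_x = local_offset_x - block_size_x;  // SX2 == BSX here
    std::printf("aligned:    local %d, global %d\n",
                local_offset_x, global_offset_x);
  }
}

int main() {
  printMisalignedStages(5);      // e.g. a 5-wide filter window
  printAlignedStages(5, 32);     // same window, block width 32
  return 0;
}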