// stage single image line (warp size) to shared memory
void ASTTranslate::stageLineToSharedMemory(ParmVarDecl *PVD,
    SmallVector<Stmt *, 16> &stageBody, Expr *local_offset_x,
    Expr *local_offset_y, Expr *global_offset_x, Expr *global_offset_y) {
  VarDecl *VD = KernelDeclMapShared[PVD];
  HipaccAccessor *Acc = KernelDeclMapAcc[PVD];
  DeclRefExpr *paramDRE = createDeclRefExpr(Ctx, PVD);

  Expr *LHS = accessMemShared(createDeclRefExpr(Ctx, VD), local_offset_x,
      local_offset_y);

  Expr *RHS;
  if (bh_variant.borderVal) {
    SmallVector<Stmt *, 16> bhStmts;
    SmallVector<CompoundStmt *, 16> bhCStmt;
    RHS = addBorderHandling(paramDRE, global_offset_x, global_offset_y, Acc,
        bhStmts, bhCStmt);

    // add border handling statements to stageBody
    for (auto stmt : bhStmts)
      stageBody.push_back(stmt);
  } else {
    RHS = accessMem(paramDRE, Acc, READ_ONLY, global_offset_x,
        global_offset_y);
  }

  stageBody.push_back(createBinaryOperator(Ctx, LHS, RHS, BO_Assign,
        Acc->getImage()->getType()));
}
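// Illustrative note (not part of the translator): the statement pushed to
// stageBody corresponds, roughly, to device code of the following shape,
// where the names "_smem_in", "in", "stride", "lid_x/y", and "gid_x/y" are
// assumed placeholders for whatever the AST builders actually generate:
//
//   _smem_in[lid_y + local_offset_y][lid_x + local_offset_x] =
//       in[(gid_y + global_offset_y) * stride + gid_x + global_offset_x];
//
// With border handling enabled (bh_variant.borderVal), the right-hand side
// is instead the clamped/constant access built by addBorderHandling(), and
// its helper statements are emitted into stageBody first.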
void HipaccKernel::calcConfig() {
  std::vector<std::pair<unsigned int, float> > occVec;
  unsigned int num_threads = max_threads_per_warp;
  bool use_shared = false;

  while (num_threads <= max_threads_per_block) {
    // allocation limits per thread block
    int warps_per_block = (int)ceil((float)num_threads /
        (float)max_threads_per_warp);
    int registers_per_block;

    if (isAMDGPU()) {
      // for AMD assume simple allocation strategy
      registers_per_block = warps_per_block * num_reg * max_threads_per_warp;
    } else {
      switch (allocation_granularity) {
        case BLOCK:
          // allocation in steps of two
          registers_per_block = (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size *
            num_reg * max_threads_per_warp;
          registers_per_block = (int)ceil((float)registers_per_block /
              (float)register_alloc_size) * register_alloc_size;
          break;
        case WARP:
          registers_per_block = (int)ceil((float)(num_reg *
                max_threads_per_warp) / (float)register_alloc_size) *
            register_alloc_size;
          registers_per_block *= (int)ceil((float)warps_per_block /
              (float)warp_register_alloc_size) * warp_register_alloc_size;
          break;
      }
    }

    unsigned int smem_used = 0;
    bool skip_config = false;
    // calculate shared memory usage for pixels staged to shared memory
    for (size_t i=0; i<KC->getNumImages(); ++i) {
      HipaccAccessor *Acc = getImgFromMapping(KC->getImgFields().data()[i]);
      if (useLocalMemory(Acc)) {
        // check if the configuration suits our assumptions about shared memory
        if (num_threads % 32 == 0) {
          // fixed shared memory for x: 3*BSX
          int size_x = 32;
          if (Acc->getSizeX() > 1) {
            size_x *= 3;
          }
          // add padding to avoid bank conflicts
          size_x += 1;

          // size_y = ceil((PPT*BSY+SY-1)/BSY) * BSY
          int threads_y = num_threads/32;
          int size_y = (int)ceilf((float)(getPixelsPerThread()*threads_y +
                Acc->getSizeY()-1)/(float)threads_y) * threads_y;

          smem_used += size_x*size_y * Acc->getImage()->getPixelSize();
          use_shared = true;
        } else {
          skip_config = true;
        }
      }
    }

    if (skip_config || smem_used > max_total_shared_memory) {
      num_threads += max_threads_per_warp;
      continue;
    }

    int shared_memory_per_block = (int)ceil((float)smem_used /
        (float)shared_memory_alloc_size) * shared_memory_alloc_size;

    // maximum thread blocks per multiprocessor
    int lim_by_max_warps = std::min(max_blocks_per_multiprocessor,
        (unsigned int)floor((float)max_warps_per_multiprocessor /
          (float)warps_per_block));
    int lim_by_reg, lim_by_smem;

    if (num_reg > max_register_per_thread) {
      lim_by_reg = 0;
    } else {
      if (num_reg > 0) {
        lim_by_reg = (int)floor((float)max_total_registers /
            (float)registers_per_block);
      } else {
        lim_by_reg = max_blocks_per_multiprocessor;
      }
    }

    if (smem_used > 0) {
      lim_by_smem = (int)floor((float)max_total_shared_memory /
          (float)shared_memory_per_block);
    } else {
      lim_by_smem = max_blocks_per_multiprocessor;
    }

    // calculate GPU occupancy
    int active_thread_blocks_per_multiprocessor = std::min(std::min(
          lim_by_max_warps, lim_by_reg), lim_by_smem);
    if (active_thread_blocks_per_multiprocessor > 0)
      max_threads_for_kernel = num_threads;
    int active_warps_per_multiprocessor =
      active_thread_blocks_per_multiprocessor * warps_per_block;
    //int active_threads_per_multiprocessor = active_thread_blocks_per_multiprocessor * num_threads;
    float occupancy = (float)active_warps_per_multiprocessor /
      (float)max_warps_per_multiprocessor;
    //int max_simultaneous_blocks_per_GPU = active_thread_blocks_per_multiprocessor*max_multiprocessors_per_GPU;

    occVec.push_back(std::pair<unsigned int, float>(num_threads, occupancy));
    num_threads += max_threads_per_warp;
  }
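  // Worked example of one sweep iteration, using assumed device limits that
  // are not taken from the surrounding code (32 threads/warp, 48 warps and
  // 8 blocks per multiprocessor, 32768 registers, 48 KiB shared memory,
  // BLOCK granularity, warp_register_alloc_size = 2, register_alloc_size = 64):
  //   num_threads = 256, num_reg = 20, smem_used = 6144 bytes
  //   warps_per_block      = 256/32                 = 8
  //   registers_per_block  = ceil(8/2)*2 * 20 * 32  = 5120
  //   lim_by_max_warps     = min(8, floor(48/8))    = 6
  //   lim_by_reg           = floor(32768/5120)      = 6
  //   lim_by_smem          = floor(49152/6144)      = 8
  //   active blocks per SM = min(6, 6, 8)           = 6
  //   occupancy            = (6*8)/48               = 1.0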
  // sort configurations according to occupancy and number of threads
  std::sort(occVec.begin(), occVec.end(), sortOccMap());

  // calculate (optimal) kernel configuration from the kernel window sizes and
  // ignore the limitation of maximal threads per block
  unsigned int num_threads_x_opt = max_threads_per_warp;
  unsigned int num_threads_y_opt = 1;
  while (num_threads_x_opt < max_size_x>>1)
    num_threads_x_opt += max_threads_per_warp;
  while (num_threads_y_opt*getPixelsPerThread() < max_size_y>>1)
    num_threads_y_opt += 1;

  // Heuristic:
  // 0) maximize occupancy (e.g. to hide instruction latency)
  // 1) - minimize #threads for border handling (e.g. prefer y over x)
  //    - prefer x over y when no border handling is necessary
  llvm::errs() << "\nCalculating kernel configuration for " << kernelName << "\n";
  llvm::errs() << "\toptimal configuration: " << num_threads_x_opt << "x"
               << num_threads_y_opt << "(x" << getPixelsPerThread() << ")\n";

  for (auto iter=occVec.begin(); iter<occVec.end(); ++iter) {
    std::pair<unsigned int, float> occMap = *iter;
    llvm::errs() << "\t" << occMap.first << " threads:\t" << occMap.second << "\t";

    if (use_shared) {
      // start with warp_size or num_threads_x_opt if possible
      unsigned int num_threads_x = 32;
      unsigned int num_threads_y = occMap.first / num_threads_x;
      llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
    } else {
      // distinguish whether border handling is generated or not
      if (max_size_y > 1) {
        // start with warp_size or num_threads_x_opt if possible
        unsigned int num_threads_x = max_threads_per_warp;
        if (occMap.first >= num_threads_x_opt &&
            occMap.first % num_threads_x_opt == 0) {
          num_threads_x = num_threads_x_opt;
        }
        unsigned int num_threads_y = occMap.first / num_threads_x;
        llvm::errs() << " -> " << num_threads_x << "x" << num_threads_y;
      } else {
        // use all threads for x direction
        llvm::errs() << " -> " << occMap.first << "x1";
      }
    }
    llvm::errs() << "(x" << getPixelsPerThread() << ")\n";
  }

  // fall back to default or user-specified configuration
  unsigned int num_blocks_bh_x, num_blocks_bh_y;
  if (occVec.empty() || options.useKernelConfig()) {
    setDefaultConfig();
    num_blocks_bh_x = max_size_x<=1 ? 0 :
      (unsigned int)ceil((float)(max_size_x>>1) / (float)num_threads_x);
    num_blocks_bh_y = max_size_y<=1 ? 0 :
      (unsigned int)ceil((float)(max_size_y>>1) /
          (float)(num_threads_y*getPixelsPerThread()));
    llvm::errs() << "Using default configuration " << num_threads_x << "x"
                 << num_threads_y << " for kernel '" << kernelName << "'\n";
  } else {