void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) { BasicBlock::iterator LoopBody; SetVector<Value *> Values; SetVector<Value *> IVS; std::vector<int> NumIterations; PTXGenerator::ValueToValueMapTy VMap; assert(!GPUTriple.empty() && "Target triple should be set properly for GPGPU code generation."); PTXGenerator PTXGen(Builder, P, GPUTriple); // Get original IVS and ScopStmt unsigned TiledLoopDepth, NonPLoopDepth; const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth); const clast_stmt *TmpStmt; const clast_user_stmt *U; const clast_for *InnerFor; if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) { InnerFor = (const clast_for *)InnerStmt; TmpStmt = InnerFor->body; } else TmpStmt = InnerStmt; U = (const clast_user_stmt *)TmpStmt; ScopStmt *Statement = (ScopStmt *)U->statement->usr; for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) { const Value *IV = Statement->getInductionVariableForDimension(i); IVS.insert(const_cast<Value *>(IV)); } unsigned OutBytes; Values = getGPUValues(OutBytes); PTXGen.setOutputBytes(OutBytes); PTXGen.startGeneration(Values, IVS, VMap, &LoopBody); BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); Builder.SetInsertPoint(LoopBody); BasicBlock *AfterBB = 0; if (NonPLoopDepth) { Value *LowerBound, *UpperBound, *IV, *Stride; Type *IntPtrTy = getIntPtrTy(); LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy); UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy); Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride)); IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB, CmpInst::ICMP_SLE); const Value *OldIV_ = Statement->getInductionVariableForDimension(2); Value *OldIV = const_cast<Value *>(OldIV_); VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV)); } updateWithValueMap(VMap); BlockGenerator::generate(Builder, *Statement, ValueMap, P); if (AfterBB) Builder.SetInsertPoint(AfterBB->begin()); // FIXME: The replacement of the host base address with the parameter of ptx // subfunction should have been done by updateWithValueMap. We use the // following codes to avoid affecting other parts of Polly. This should be // fixed later. Function *FN = Builder.GetInsertBlock()->getParent(); for (unsigned j = 0; j < Values.size(); j++) { Value *baseAddr = Values[j]; for (Function::iterator B = FN->begin(); B != FN->end(); ++B) { for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I) I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]); } } Builder.SetInsertPoint(AfterLoop); PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1], NumIterations[2], NumIterations[3]); PTXGen.finishGeneration(FN); }
void PTXGenerator::createSubfunction(SetVector<Value *> &UsedValues, SetVector<Value *> &OriginalIVS, PTXGenerator::ValueToValueMapTy &VMap, Function **SubFunction) { Function *FN = createSubfunctionDefinition(UsedValues.size()); Module *M = getModule(); LLVMContext &Context = FN->getContext(); IntegerType *Ty = Builder.getInt64Ty(); // Store the previous basic block. BasicBlock *PrevBB = Builder.GetInsertBlock(); // Create basic blocks. BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN); BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN); BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN); DominatorTree &DT = P->getAnalysis<DominatorTree>(); DT.addNewBlock(HeaderBB, PrevBB); DT.addNewBlock(ExitBB, HeaderBB); DT.addNewBlock(BodyBB, HeaderBB); Builder.SetInsertPoint(HeaderBB); // Insert VMap items with maps of array base address on the host to base // address on the device. Function::arg_iterator AI = FN->arg_begin(); for (unsigned j = 0; j < UsedValues.size(); j++) { Value *BaseAddr = UsedValues[j]; Type *ArrayTy = BaseAddr->getType(); Value *Param = Builder.CreateBitCast(AI, ArrayTy); VMap.insert(std::make_pair<Value *, Value *>(BaseAddr, Param)); AI++; } // FIXME: These intrinsics should be inserted on-demand. However, we insert // them all currently for simplicity. Function *GetNctaidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x); Function *GetNctaidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y); Function *GetCtaidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x); Function *GetCtaidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y); Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x); Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y); Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x); Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y); Value *GridWidth = Builder.CreateCall(GetNctaidX); GridWidth = Builder.CreateIntCast(GridWidth, Ty, false); Value *GridHeight = Builder.CreateCall(GetNctaidY); GridHeight = Builder.CreateIntCast(GridHeight, Ty, false); Value *BlockWidth = Builder.CreateCall(GetNtidX); BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false); Value *BlockHeight = Builder.CreateCall(GetNtidY); BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false); Value *BIDx = Builder.CreateCall(GetCtaidX); BIDx = Builder.CreateIntCast(BIDx, Ty, false); Value *BIDy = Builder.CreateCall(GetCtaidY); BIDy = Builder.CreateIntCast(BIDy, Ty, false); Value *TIDx = Builder.CreateCall(GetTidX); TIDx = Builder.CreateIntCast(TIDx, Ty, false); Value *TIDy = Builder.CreateCall(GetTidY); TIDy = Builder.CreateIntCast(TIDy, Ty, false); Builder.CreateBr(BodyBB); Builder.SetInsertPoint(BodyBB); unsigned NumDims = OriginalIVS.size(); std::vector<Value *> Substitutions; Value *BlockID, *ThreadID; switch (NumDims) { case 1: { Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight, "p_gpu_blocksize"); BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); BlockID = Builder.CreateAdd(BlockID, BIDx); BlockID = Builder.CreateMul(BlockID, BlockSize); ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); ThreadID = Builder.CreateAdd(ThreadID, TIDx); ThreadID = Builder.CreateAdd(ThreadID, BlockID); Substitutions.push_back(ThreadID); break; } case 2: { BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); BlockID = Builder.CreateAdd(BlockID, BIDx); Substitutions.push_back(BlockID); ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); ThreadID = Builder.CreateAdd(ThreadID, TIDx); Substitutions.push_back(ThreadID); break; } case 3: { BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); BlockID = Builder.CreateAdd(BlockID, BIDx); Substitutions.push_back(BlockID); Substitutions.push_back(TIDy); Substitutions.push_back(TIDx); break; } case 4: { Substitutions.push_back(BIDy); Substitutions.push_back(BIDx); Substitutions.push_back(TIDy); Substitutions.push_back(TIDx); break; } default: assert(true && "We cannot transform parallel loops whose depth is larger than 4."); return; } assert(OriginalIVS.size() == Substitutions.size() && "The size of IVS should be equal to the size of substitutions."); for (unsigned i = 0; i < OriginalIVS.size(); ++i) { VMap.insert( std::make_pair<Value *, Value *>(OriginalIVS[i], Substitutions[i])); } Builder.CreateBr(ExitBB); Builder.SetInsertPoint(--Builder.GetInsertPoint()); BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); // Add the termination of the ptx-device subfunction. Builder.SetInsertPoint(ExitBB); Builder.CreateRetVoid(); Builder.SetInsertPoint(LoopBody); *SubFunction = FN; }