Example #1
0
void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
  BasicBlock::iterator LoopBody;
  SetVector<Value *> Values;
  SetVector<Value *> IVS;
  std::vector<int> NumIterations;
  PTXGenerator::ValueToValueMapTy VMap;

  assert(!GPUTriple.empty() &&
         "Target triple should be set properly for GPGPU code generation.");
  PTXGenerator PTXGen(Builder, P, GPUTriple);

  // Get original IVS and ScopStmt
  unsigned TiledLoopDepth, NonPLoopDepth;
  const clast_stmt *InnerStmt =
      getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth);
  const clast_stmt *TmpStmt;
  const clast_user_stmt *U;
  const clast_for *InnerFor;
  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
    InnerFor = (const clast_for *)InnerStmt;
    TmpStmt = InnerFor->body;
  } else
    TmpStmt = InnerStmt;
  U = (const clast_user_stmt *)TmpStmt;
  ScopStmt *Statement = (ScopStmt *)U->statement->usr;
  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
    const Value *IV = Statement->getInductionVariableForDimension(i);
    IVS.insert(const_cast<Value *>(IV));
  }

  unsigned OutBytes;
  Values = getGPUValues(OutBytes);
  PTXGen.setOutputBytes(OutBytes);
  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);

  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  BasicBlock *AfterBB = 0;
  if (NonPLoopDepth) {
    Value *LowerBound, *UpperBound, *IV, *Stride;
    Type *IntPtrTy = getIntPtrTy();
    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
                    CmpInst::ICMP_SLE);
    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
    Value *OldIV = const_cast<Value *>(OldIV_);
    VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV));
  }

  updateWithValueMap(VMap);

  BlockGenerator::generate(Builder, *Statement, ValueMap, P);

  if (AfterBB)
    Builder.SetInsertPoint(AfterBB->begin());

  // FIXME: The replacement of the host base address with the parameter of ptx
  // subfunction should have been done by updateWithValueMap. We use the
  // following codes to avoid affecting other parts of Polly. This should be
  // fixed later.
  Function *FN = Builder.GetInsertBlock()->getParent();
  for (unsigned j = 0; j < Values.size(); j++) {
    Value *baseAddr = Values[j];
    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
    }
  }
  Builder.SetInsertPoint(AfterLoop);
  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
                                NumIterations[2], NumIterations[3]);
  PTXGen.finishGeneration(FN);
}
Example #2
0
void PTXGenerator::createSubfunction(SetVector<Value *> &UsedValues,
                                     SetVector<Value *> &OriginalIVS,
                                     PTXGenerator::ValueToValueMapTy &VMap,
                                     Function **SubFunction) {
  Function *FN = createSubfunctionDefinition(UsedValues.size());
  Module *M = getModule();
  LLVMContext &Context = FN->getContext();
  IntegerType *Ty = Builder.getInt64Ty();

  // Store the previous basic block.
  BasicBlock *PrevBB = Builder.GetInsertBlock();

  // Create basic blocks.
  BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
  BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
  BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTree>();
  DT.addNewBlock(HeaderBB, PrevBB);
  DT.addNewBlock(ExitBB, HeaderBB);
  DT.addNewBlock(BodyBB, HeaderBB);

  Builder.SetInsertPoint(HeaderBB);

  // Insert VMap items with maps of array base address on the host to base
  // address on the device.
  Function::arg_iterator AI = FN->arg_begin();
  for (unsigned j = 0; j < UsedValues.size(); j++) {
    Value *BaseAddr = UsedValues[j];
    Type *ArrayTy = BaseAddr->getType();
    Value *Param = Builder.CreateBitCast(AI, ArrayTy);
    VMap.insert(std::make_pair<Value *, Value *>(BaseAddr, Param));
    AI++;
  }

  // FIXME: These intrinsics should be inserted on-demand. However, we insert
  // them all currently for simplicity.
  Function *GetNctaidX =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
  Function *GetNctaidY =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
  Function *GetCtaidX =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
  Function *GetCtaidY =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
  Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
  Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
  Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
  Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);

  Value *GridWidth = Builder.CreateCall(GetNctaidX);
  GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
  Value *GridHeight = Builder.CreateCall(GetNctaidY);
  GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
  Value *BlockWidth = Builder.CreateCall(GetNtidX);
  BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
  Value *BlockHeight = Builder.CreateCall(GetNtidY);
  BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
  Value *BIDx = Builder.CreateCall(GetCtaidX);
  BIDx = Builder.CreateIntCast(BIDx, Ty, false);
  Value *BIDy = Builder.CreateCall(GetCtaidY);
  BIDy = Builder.CreateIntCast(BIDy, Ty, false);
  Value *TIDx = Builder.CreateCall(GetTidX);
  TIDx = Builder.CreateIntCast(TIDx, Ty, false);
  Value *TIDy = Builder.CreateCall(GetTidY);
  TIDy = Builder.CreateIntCast(TIDy, Ty, false);

  Builder.CreateBr(BodyBB);
  Builder.SetInsertPoint(BodyBB);

  unsigned NumDims = OriginalIVS.size();
  std::vector<Value *> Substitutions;
  Value *BlockID, *ThreadID;
  switch (NumDims) {
  case 1: {
    Value *BlockSize =
        Builder.CreateMul(BlockWidth, BlockHeight, "p_gpu_blocksize");
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    BlockID = Builder.CreateMul(BlockID, BlockSize);
    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
    ThreadID = Builder.CreateAdd(ThreadID, BlockID);
    Substitutions.push_back(ThreadID);
    break;
  }
  case 2: {
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    Substitutions.push_back(BlockID);
    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
    Substitutions.push_back(ThreadID);
    break;
  }
  case 3: {
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    Substitutions.push_back(BlockID);
    Substitutions.push_back(TIDy);
    Substitutions.push_back(TIDx);
    break;
  }
  case 4: {
    Substitutions.push_back(BIDy);
    Substitutions.push_back(BIDx);
    Substitutions.push_back(TIDy);
    Substitutions.push_back(TIDx);
    break;
  }
  default:
    assert(true &&
           "We cannot transform parallel loops whose depth is larger than 4.");
    return;
  }

  assert(OriginalIVS.size() == Substitutions.size() &&
         "The size of IVS should be equal to the size of substitutions.");
  for (unsigned i = 0; i < OriginalIVS.size(); ++i) {
    VMap.insert(
        std::make_pair<Value *, Value *>(OriginalIVS[i], Substitutions[i]));
  }

  Builder.CreateBr(ExitBB);
  Builder.SetInsertPoint(--Builder.GetInsertPoint());
  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();

  // Add the termination of the ptx-device subfunction.
  Builder.SetInsertPoint(ExitBB);
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(LoopBody);
  *SubFunction = FN;
}