Example #1
0
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    CharUnits TyWidth, TyAlign;
    std::tie(TyWidth, TyAlign) =
        CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TyAlign);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TyWidth;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
Example #2
0
/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
/// function. Here is the logic:
/// if (Cond) {
///   CodeGen(true);
/// } else {
///   CodeGen(false);
/// }
static void EmitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
                            const std::function<void(bool)> &CodeGen) {
  CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange());

  // If the condition constant folds and can be elided, try to avoid emitting
  // the condition and the dead arm of the if/else.
  bool CondConstant;
  if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) {
    CodeGen(CondConstant);
    return;
  }

  // Otherwise, the condition did not fold, or we couldn't elide it.  Just
  // emit the conditional branch.
  auto ThenBlock = CGF.createBasicBlock(/*name*/ "omp_if.then");
  auto ElseBlock = CGF.createBasicBlock(/*name*/ "omp_if.else");
  auto ContBlock = CGF.createBasicBlock(/*name*/ "omp_if.end");
  CGF.EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount*/ 0);

  // Emit the 'then' code.
  CGF.EmitBlock(ThenBlock);
  CodeGen(/*ThenBlock*/ true);
  CGF.EmitBranch(ContBlock);
  // Emit the 'else' code if present.
  {
    // There is no need to emit line number for unconditional branch.
    SuppressDebugLocation SDL(CGF.Builder);
    CGF.EmitBlock(ElseBlock);
  }
  CodeGen(/*ThenBlock*/ false);
  {
    // There is no need to emit line number for unconditional branch.
    SuppressDebugLocation SDL(CGF.Builder);
    CGF.EmitBranch(ContBlock);
  }
  // Emit the continuation block for code after the if.
  CGF.EmitBlock(ContBlock, /*IsFinished*/ true);
}
Example #3
0
static void EmitOMPIfStmt(CodeGenFunction &CGF, llvm::Value *IfCond,
                          const std::function<void()> &BodyOpGen) {
  llvm::Value *CallBool = CGF.EmitScalarConversion(
      IfCond,
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true),
      CGF.getContext().BoolTy);

  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
  auto *ContBlock = CGF.createBasicBlock("omp_if.end");
  // Generate the branch (If-stmt)
  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
  CGF.EmitBlock(ThenBlock);
  BodyOpGen();
  // Emit the rest of bblocks/branches
  CGF.EmitBranch(ContBlock);
  CGF.EmitBlock(ContBlock, true);
}
void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
                                         FunctionArgList &Args) {
  // Build the argument value list and the argument stack struct type.
  SmallVector<llvm::Value *, 16> ArgValues;
  std::vector<llvm::Type *> ArgTypes;
  for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end();
       I != E; ++I) {
    llvm::Value *V = CGF.GetAddrOfLocalVar(*I);
    ArgValues.push_back(V);
    assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
    ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
  }
  llvm::StructType *ArgStackTy = llvm::StructType::get(
      CGF.getLLVMContext(), ArgTypes);

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Emit the calls to cudaSetupArgument
  llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
  for (unsigned I = 0, E = Args.size(); I != E; ++I) {
    llvm::Value *Args[3];
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy);
    Args[1] = CGF.Builder.CreateIntCast(
        llvm::ConstantExpr::getSizeOf(ArgTypes[I]),
        SizeTy, false);
    Args[2] = CGF.Builder.CreateIntCast(
        llvm::ConstantExpr::getOffsetOf(ArgStackTy, I),
        SizeTy, false);
    llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
    CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
  }

  // Emit the call to cudaLaunch
  llvm::Constant *cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
Example #5
0
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get("cudaLaunchKernel");
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for cudaLaunchKernel()");
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      "__cudaPopCallConfiguration");

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}