int UdpSocket::send(Address addr, unsigned short port, const char* bytes, int len) { if (addr.getType() != addr_type) return 0; char addr_buf[sizeof(sockaddr_in6)]; memset(addr_buf, 0, sizeof(sockaddr_in6)); if (addr_type == Address::IPV4) { #if !defined(__ANDROID__) ((sockaddr_in*)addr_buf)->sin_len = sizeof(sockaddr_in); #endif ((sockaddr_in*)addr_buf)->sin_family = AF_INET; memcpy(&((sockaddr_in*)addr_buf)->sin_addr, addr.getAddr(), sizeof(in_addr)); ((sockaddr_in*)addr_buf)->sin_port = htons(port); } else { #if !defined(__ANDROID__) ((sockaddr_in6*)addr_buf)->sin6_len = sizeof(sockaddr_in6); #endif ((sockaddr_in6*)addr_buf)->sin6_family = AF_INET6; memcpy(&((sockaddr_in6*)addr_buf)->sin6_addr, addr.getAddr(), sizeof(in6_addr)); ((sockaddr_in*)addr_buf)->sin_port = htons(port); } return (int)sendto(fd, bytes, len, 0, (sockaddr*)addr_buf, addr_type == Address::IPV4 ? sizeof(sockaddr_in) : sizeof(sockaddr_in6)); }
/// Capture an RValue so it can be reloaded at a later, dominated point.
/// Values that already dominate everything are recorded as literals;
/// everything else is spilled into a temporary alloca and saved by address.
DominatingValue<RValue>::saved_type
DominatingValue<RValue>::saved_type::save(CodeGenFunction &CGF, RValue rv) {
  if (rv.isScalar()) {
    llvm::Value *scalar = rv.getScalarVal();

    // These automatically dominate and don't need to be saved.
    if (!DominatingLLVMValue::needsSaving(scalar))
      return saved_type(scalar, ScalarLiteral);

    // Everything else needs an alloca.
    Address slot =
        CGF.CreateDefaultAlignTempAlloca(scalar->getType(), "saved-rvalue");
    CGF.Builder.CreateStore(scalar, slot);
    return saved_type(slot.getPointer(), ScalarAddress);
  }

  if (rv.isComplex()) {
    CodeGenFunction::ComplexPairTy pair = rv.getComplexVal();
    llvm::Type *pairTy = llvm::StructType::get(pair.first->getType(),
                                               pair.second->getType(),
                                               (void*) nullptr);
    Address slot = CGF.CreateDefaultAlignTempAlloca(pairTy, "saved-complex");
    // Real part lives in field 0 at offset zero; the imaginary part follows
    // at the alloc size of the real component's type.
    CGF.Builder.CreateStore(pair.first,
                            CGF.Builder.CreateStructGEP(slot, 0, CharUnits()));
    CharUnits imagOffset = CharUnits::fromQuantity(
        CGF.CGM.getDataLayout().getTypeAllocSize(pair.first->getType()));
    CGF.Builder.CreateStore(pair.second,
                            CGF.Builder.CreateStructGEP(slot, 1, imagOffset));
    return saved_type(slot.getPointer(), ComplexAddress);
  }

  assert(rv.isAggregate());
  Address aggregate = rv.getAggregateAddress(); // TODO: volatile?
  if (!DominatingLLVMValue::needsSaving(aggregate.getPointer()))
    return saved_type(aggregate.getPointer(), AggregateLiteral,
                      aggregate.getAlignment().getQuantity());

  // Spill the aggregate's pointer itself into a pointer-sized temporary.
  Address slot = CGF.CreateTempAlloca(aggregate.getType(),
                                      CGF.getPointerAlign(), "saved-rvalue");
  CGF.Builder.CreateStore(aggregate.getPointer(), slot);
  return saved_type(slot.getPointer(), AggregateAddress,
                    aggregate.getAlignment().getQuantity());
}
// Evaluates the program's "main" combinator: builds a tiny bootstrap
// supercombinator that pushes the global "main" and forces it with EVAL,
// then runs it and returns the resulting address.
Address GMachine::executeMain() {
    SuperCombinator* main = getCombinator("main");

    // Locate "main" in the globals table so PUSH_GLOBAL can reference it.
    int mainIndex = -1;
    for (size_t ii = 0; ii < globals.size(); ++ii) {
        Address addr = globals[ii];
        if (addr.getType() == GLOBAL && addr.getNode()->global == main) {
            mainIndex = int(ii);
            break;
        }
    }
    assert(mainIndex != -1);

    SuperCombinator sc;
    // BUGFIX: was `sc.name == "__main";` — a discarded comparison instead of
    // an assignment, leaving the bootstrap combinator unnamed.
    sc.name = "__main";
    sc.arity = 0;
    sc.instructions = std::vector<GInstruction> {
        GInstruction(GOP::PUSH_GLOBAL, mainIndex),
        GInstruction(GOP::EVAL)
    };
    return evaluate(sc);
}
// Interprets the instruction stream of `environment.combinator`, using
// `environment.stack` as the operand stack and `heap` as the node store.
// NOTE(review): nodes are addressed via `&heap.back()` immediately after
// push_back; this is only safe if `heap` has stable element addresses
// (e.g. std::deque or std::list) — a std::vector here would invalidate
// every previously taken node pointer on reallocation. TODO confirm.
void GMachine::execute(GEnvironment& environment) {
    const std::vector<GInstruction>& code = environment.combinator->instructions;
    StackFrame<Address>& stack = environment.stack;
    for (size_t index = 0; index < code.size(); index++) {
        const GInstruction& instruction = code[index];
        switch (instruction.op) {
        case GOP::ALLOC: {
            // Push `value` freshly allocated empty indirection nodes.
            for (int i = 0; i < instruction.value; i++) {
                heap.push_back(Node(nullptr));
                environment.stack.push(Address::indirection(&heap.back()));
            }
        } break;
        case GOP::EVAL: {
            // Force the top of the stack to WHNF by running a one-instruction
            // UNWIND combinator in a child environment.
            // BUGFIX: name was the typo "__uniwnd" (only visible in debug traces).
            static SuperCombinator unwind { "__unwind", Type(), 0,
                std::vector<GInstruction>{ GInstruction(GOP::UNWIND) } };
            GEnvironment child = environment.child(&unwind);
            child.stack.push(environment.stack.top());
            execute(child);
            environment.stack.top() = child.stack.top();
        } break;
        case GOP::MKAP: {
            // Replace the two topmost nodes with one application node.
            Address func = environment.stack.top();
            environment.stack.pop();
            Address arg = environment.stack.top();
            heap.push_back(Node(func, arg));
            environment.stack.top() = Address::application(&heap.back());
        } break;
        case GOP::PACK: {
            int tag = instruction.value & (0xFFFF); // Low two bytes
            int arity = instruction.value >> 16;    // High two bytes
            // arity+1 slots: `arity` arguments plus one end-of-constructor marker.
            heap.push_back(Node(tag, new Address[arity + 1]));
            Node& ctor = heap.back();
            for (int ii = 0; ii < arity; ii++) {
                ctor.constructor.arguments[ii] = stack.pop();
            }
            // BUGFIX: the end marker was written at arguments[arity + 1], one
            // past the end of the arity+1 sized array (heap overflow). The
            // last valid slot is arguments[arity].
            ctor.constructor.arguments[arity] = Address::indirection(nullptr); // Use as end of this constructor
            stack.push(Address::constructor(&ctor));
        } break;
        case GOP::SPLIT: {
            // Unpack `value` constructor fields onto the stack.
            Address top = stack.pop();
            assert(top.getType() == CONSTRUCTOR);
            ConstructorNode& ctor = top.getNode()->constructor;
            for (int ii = 0; ii < instruction.value; ii++) {
                // A null indirection would be the end marker — must not
                // appear among the first `value` arguments.
                assert(ctor.arguments[ii].getType() != NodeType::INDIRECTION
                    || ctor.arguments[ii].getNode() != nullptr);
                stack.push(ctor.arguments[ii]);
            }
        } break;
        case GOP::CASEJUMP: {
            Address top = stack.top();
            assert(top.getType() == CONSTRUCTOR);
            ConstructorNode& ctor = top.getNode()->constructor;
            if (ctor.tag != instruction.value)
                index++; // Skip the next instruction which is the jump instruction
        } break;
        case GOP::JUMP:
            // -1 because the loop's index++ then lands on the jump target.
            index = instruction.value - 1;
            break;
        case GOP::POP:
            for (int i = 0; i < instruction.value; i++) {
                environment.stack.pop();
            }
            break;
        case GOP::PUSH: {
            // Duplicate the stack entry at offset `value`.
            Address addr = environment.stack[instruction.value];
            environment.stack.push(addr);
        } break;
        case GOP::PUSH_DICTIONARY_MEMBER: {
            assert(stack.base().getType() == NodeType::CONSTRUCTOR); // Must be instance dictionary
            ConstructorNode& ctor = stack.base().getNode()->constructor;
            Address& func = ctor.arguments[instruction.value];
            stack.push(func);
        } break;
        case GOP::PUSH_GLOBAL: {
            Address addr = globals.at(instruction.value);
            environment.stack.push(addr);
        } break;
        case GOP::PUSH_INT: {
            heap.push_back(Node(instruction.value));
            environment.stack.push(Address::number(&heap.back()));
        } break;
        case GOP::PUSH_DOUBLE: {
            heap.push_back(Node(instruction.doubleValue));
            environment.stack.push(Address::numberDouble(&heap.back()));
        } break;
        case GOP::SLIDE: {
            slide(environment, instruction);
        } break;
        case GOP::UNWIND: {
            // Unwind the spine of the expression on top of the stack.
            Address top = environment.stack.top();
            switch (top.getType()) {
            case NUMBER:
                break; // Already in WHNF — nothing to do.
            case APPLICATION: {
                // Walk down the spine: push the function and unwind again.
                Node& n = *top.getNode();
                environment.stack.push(n.apply.func);
                --index; // Redo the unwind instruction
            } break;
            case FUNCTION_POINTER: {
                int arity = top.getNode()->function.args;
                if (stack.stackSize() - 1 < size_t(arity)) {
                    // Too few arguments: partial application, leave only the root.
                    while (stack.stackSize() > 1) {
                        stack.pop();
                    }
                } else {
                    // Replace the application nodes on the stack with their
                    // argument nodes so the FFI function sees raw arguments.
                    size_t ii = environment.stack.stackSize() - arity - 1;
                    for (; ii < environment.stack.stackSize() - 1; ii++) {
                        Address& addr = environment.stack[ii];
                        assert(addr.getType() == APPLICATION);
                        addr = addr.getNode()->apply.arg;
                    }
                    t_ffi_func func = top.getNode()->function.ptr;
                    assert(func != nullptr);
                    StackFrame<Address> newStack = stack.makeChildFrame(arity + 1);
                    func(this, &newStack);
                    Address result = newStack.top();
                    for (int i = 0; i < arity; i++)
                        environment.stack.pop();
                    environment.stack.push(result);
                }
            } break;
            case GLOBAL: {
                SuperCombinator* comb = top.getNode()->global;
                if (environment.stack.stackSize() - 1 < size_t(comb->arity)) {
                    // Partial application of a supercombinator.
                    while (stack.stackSize() > 1) {
                        stack.pop();
                    }
                } else {
                    // Before calling the function, replace all applications on
                    // the stack with the actual arguments.
                    // This gives faster access to a functions arguments when using PUSH
                    size_t ii = environment.stack.stackSize() - comb->arity - 1;
                    for (; ii < environment.stack.stackSize() - 1; ii++) {
                        Address& addr = environment.stack[ii];
                        assert(addr.getType() == APPLICATION);
                        addr = addr.getNode()->apply.arg;
                    }
                    GEnvironment child = environment.child(comb);
                    if (debug) {
                        std::cerr << "Executing function '" << comb->name << "'" << std::endl;
                        std::cerr << "Arguments { ";
                        for (size_t i = 0; i < child.stack.stackSize(); i++) {
                            std::cerr << child.stack[i];
                        }
                        std::cerr << " }" << std::endl;
                    }
                    execute(child);
                    Address result = child.stack.top();
                    for (int i = 0; i < comb->arity; i++)
                        environment.stack.pop();
                    environment.stack.push(result);
                }
            } break;
            case INDIRECTION: {
                // Follow the indirection and unwind its target.
                environment.stack.top() = top.getNode()->indirection;
                --index; // Redo the unwind instruction
            } break;
            default:
                break;
            }
        } break;
        case GOP::UPDATE: {
            // Overwrite the stack entry at `value` with an indirection to the
            // (copied) top node, implementing lazy-evaluation sharing.
            Address top = environment.stack.top();
            heap.push_back(Node(top));
            environment.stack[instruction.value] = Address::indirection(&heap.back());
        } break;

// Integer comparison: pop rhs, read lhs from the new top, replace it with
// the int result node. Operates on the int `number` field only.
#define BINOP2(op, opname) \
        case GOP:: opname: \
        { \
            Address rhs = environment.stack.pop(); \
            Address lhs = environment.stack.top(); \
            int result = lhs.getNode()->number op rhs.getNode()->number; \
            heap.push_back(Node(result)); \
            environment.stack.top() = Address::number(&heap.back()); \
        } \
        break;
#define BINOP(f, name) case GOP:: name: binopInt<f>(environment, heap); break;
        BINOP(add<int>, ADD)
        BINOP(subtract<int>, SUBTRACT)
        BINOP(multiply<int>, MULTIPLY)
        BINOP(divide<int>, DIVIDE)
        BINOP(remainder<int>, REMAINDER)
#define BINOP_DOUBLE(f, name) case GOP:: name: binopDouble<f>(environment, heap); break;
        BINOP_DOUBLE(add<double>, ADD_DOUBLE)
        BINOP_DOUBLE(subtract<double>, SUBTRACT_DOUBLE)
        BINOP_DOUBLE(multiply<double>, MULTIPLY_DOUBLE)
        BINOP_DOUBLE(divide<double>, DIVIDE_DOUBLE)
#undef BINOP_DOUBLE
        case GOP::NEGATE: {
            Address x = environment.stack.top();
            heap.push_back(Node(-x.getNode()->number));
            environment.stack.top() = Address::number(&heap.back());
        } break;
        BINOP2(==, COMPARE_EQ)
        BINOP2(!=, COMPARE_NEQ)
        BINOP2(>, COMPARE_GT)
        BINOP2(>=, COMPARE_GE)
        BINOP2(<, COMPARE_LT)
        BINOP2(<=, COMPARE_LE)
#undef BINOP
#undef BINOP2
        default:
            std::cout << "Unimplemented instruction " << int(code[index].op) << std::endl;
            break;
        }
    }
    if (debug) {
        std::cerr << "Returning '" << stack.top() << "' from '"
                  << environment.combinator->name << "'" << std::endl;
    }
}
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
//
// Emits the body of a host-side kernel stub: it packs the stub's own
// parameters into a void*[] array, pops the launch configuration recorded by
// the <<<...>>> call via __cudaPopCallConfiguration, and then calls
// cudaLaunchKernel with the stub's address as the kernel handle.
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get("cudaLaunchKernel");
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  // If the lookup yields several declarations, the last FunctionDecl wins.
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    // The CUDA headers (which declare cudaLaunchKernel) were not included;
    // report and bail out without emitting a launch.
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for cudaLaunchKernel()");
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  // The dim3 type is taken from cudaLaunchKernel's own second parameter so it
  // matches whatever the included CUDA headers define.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  // __cudaPopCallConfiguration fills the four temporaries with the launch
  // configuration pushed by the preceding <<<...>>> setup call.
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      "__cudaPopCallConfiguration");
  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  // The stub function's own address doubles as the kernel handle.
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  // Build the call using the declaration's canonical function type so the
  // emitted call matches the header's signature exactly.
  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);
  CGF.EmitBlock(EndBlock);
}