static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS) { Type *I32Ty = Builder.getInt32Ty(); Type *I64Ty = Builder.getInt64Ty(); Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty); Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty); Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64); Value *Lo = Builder.CreateTrunc(MUL64, I32Ty); Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32)); Hi = Builder.CreateTrunc(Hi, I32Ty); return std::make_pair(Lo, Hi); }
std::pair<Value *, Value *> AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { if (!IsAMDHSA) { Function *LocalSizeYFn = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); Function *LocalSizeZFn = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); return std::make_pair(LocalSizeY, LocalSizeZ); } // We must read the size out of the dispatch pointer. assert(IsAMDGCN); // We are indexing into this struct, and want to extract the workgroup_size_* // fields. // // typedef struct hsa_kernel_dispatch_packet_s { // uint16_t header; // uint16_t setup; // uint16_t workgroup_size_x ; // uint16_t workgroup_size_y; // uint16_t workgroup_size_z; // uint16_t reserved0; // uint32_t grid_size_x ; // uint32_t grid_size_y ; // uint32_t grid_size_z; // // uint32_t private_segment_size; // uint32_t group_segment_size; // uint64_t kernel_object; // // #ifdef HSA_LARGE_MODEL // void *kernarg_address; // #elif defined HSA_LITTLE_ENDIAN // void *kernarg_address; // uint32_t reserved1; // #else // uint32_t reserved1; // void *kernarg_address; // #endif // uint64_t reserved2; // hsa_signal_t completion_signal; // uint64_t wrapper // } hsa_kernel_dispatch_packet_t // Function *DispatchPtrFn = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); // Size of the dispatch packet struct. DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64); Type *I32Ty = Type::getInt32Ty(Mod->getContext()); Value *CastDispatchPtr = Builder.CreateBitCast( DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier // to CSE this. The loads should be mergable later anyway. Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1); LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4); Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); MDNode *MD = llvm::MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); // Extract y component. Upper half of LoadZU should be zero already. Value *Y = Builder.CreateLShr(LoadXY, 16); return std::make_pair(Y, LoadZU); }