// Builds the identifier of this CUDA device: the mode plus an "sm_arch" flag.
//
// If the compiler flags contain no "-arch=sm_" option, the compute capability
// is queried through cuDeviceComputeCapability and stored as "<major><minor>".
// Otherwise the "-arch=sm_..." token (up to the next space or the end of the
// flags) is copied out of the compiler flags.
//
// Returns the populated deviceIdentifier.
deviceIdentifier device_t<CUDA>::getIdentifier() const {
  deviceIdentifier dID;

  dID.mode_ = CUDA;

  const size_t archPos = compilerFlags.find("-arch=sm_");

  if(archPos == std::string::npos){
    OCCA_EXTRACT_DATA(CUDA, Device);

    std::stringstream archSM_;

    int major, minor;
    OCCA_CUDA_CHECK("Getting CUDA Device Arch",
                    cuDeviceComputeCapability(&major, &minor, data_.device) );

    archSM_ << major << minor;

    dID.flagMap["sm_arch"] = archSM_.str();
  }
  else{
    // The token ends at the first space after archPos, or at the end of the
    // flags when no space follows.  (The previous pointer scan tested the
    // start pointer while advancing the end pointer, so it never stopped.)
    const size_t archEnd = compilerFlags.find(' ', archPos);

    const size_t archLength = ((archEnd == std::string::npos) ?
                               std::string::npos :
                               (archEnd - archPos));

    // NOTE(review): this branch stores the whole "-arch=sm_XX" token while
    // the branch above stores only "XX" -- confirm which form consumers of
    // flagMap["sm_arch"] expect.
    dID.flagMap["sm_arch"] = compilerFlags.substr(archPos, archLength);
  }

  return dID;
}
// Releases this CUDA device: destroys the context stored in the device data
// and then deletes the device data object itself.
void device_t<CUDA>::free(){
  OCCA_EXTRACT_DATA(CUDA, Device);

  OCCA_CUDA_CHECK("Device: Freeing Context",
                  cuCtxDestroy(data_.context) );

  CUDADeviceData_t *deviceData = (CUDADeviceData_t*) data;

  delete deviceData;
}
// Allocates a CUstream on the heap, creates it with CU_STREAM_DEFAULT and
// returns the pointer to it as an opaque stream handle.
stream device_t<CUDA>::createStream(){
  OCCA_EXTRACT_DATA(CUDA, Device);

  CUstream *newStream = new CUstream;

  OCCA_CUDA_CHECK("Device: createStream",
                  cuStreamCreate(newStream, CU_STREAM_DEFAULT));

  return newStream;
}
// Releases this COI device: destroys the chief process and then deletes the
// device data object.
void device_t<COI>::free(){
  OCCA_EXTRACT_DATA(COI, Device);

  OCCA_COI_CHECK("Device: Freeing Chief Processes",
                 COIProcessDestroy(data_.chiefID, -1, false, NULL, NULL));

  // Delete through the concrete type, as the CUDA backend does; deleting the
  // untyped data pointer directly would not run the right destructor.
  delete (COIDeviceData_t*) data;
}
// Allocates a coiStream and creates a COI pipeline on the chief process,
// storing the pipeline in the stream's handle.  Returns the new stream.
stream device_t<COI>::genStream(){
  OCCA_EXTRACT_DATA(COI, Device);

  coiStream *newStream = new coiStream;

  OCCA_COI_CHECK("Device: Generating a Stream",
                 COIPipelineCreate(data_.chiefID,
                                   NULL, 0,
                                   &(newStream->handle)) );

  return newStream;
}
// Returns the warp size of the device.  The value is queried from the driver
// only while the cached simdWidth_ is still zero; afterwards the cached value
// is returned directly.
int device_t<CUDA>::simdWidth(){
  if(simdWidth_ == 0){
    OCCA_EXTRACT_DATA(CUDA, Device);

    OCCA_CUDA_CHECK("Device: Get Warp Size",
                    cuDeviceGetAttribute(&simdWidth_,
                                         CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                         data_.device) );
  }

  return simdWidth_;
}
// Loads a CUDA module from the file [filename] and looks up the function
// [functionName_] inside it, storing both in the kernel data.
// Returns this kernel.
kernel_t<CUDA>* kernel_t<CUDA>::buildFromBinary(const std::string &filename,
                                                 const std::string &functionName_){
  OCCA_EXTRACT_DATA(CUDA, Kernel);

  functionName = functionName_;

  const char *modulePath = filename.c_str();

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                  cuModuleLoad(&data_.module, modulePath));

  const char *entryName = functionName.c_str();

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                  cuModuleGetFunction(&data_.function, data_.module, entryName));

  return this;
}
// Loads a CUDA module from the in-memory image [cache] and looks up the
// function [functionName_] inside it, storing both in the kernel data.
// Returns this kernel.
kernel_t<CUDA>* kernel_t<CUDA>::loadFromLibrary(const char *cache,
                                                 const std::string &functionName_){
  OCCA_EXTRACT_DATA(CUDA, Kernel);

  functionName = functionName_;

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                  cuModuleLoadData(&data_.module, cache));

  const char *entryName = functionName.c_str();

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                  cuModuleGetFunction(&data_.function, data_.module, entryName));

  return this;
}
// Creates a COI kernel bound to this device, gives it kernel data that shares
// the device's chief process, and builds it from the binary [filename].
// Returns the new kernel.
kernel_v* device_t<COI>::buildKernelFromBinary(const std::string &filename,
                                               const std::string &functionName){
  OCCA_EXTRACT_DATA(COI, Device);

  kernel_v *newKernel = new kernel_t<COI>;
  newKernel->dev = dev;

  COIKernelData_t *kernelData = new COIKernelData_t;
  newKernel->data = kernelData;

  kernelData->chiefID = data_.chiefID;

  newKernel->buildFromBinary(filename, functionName);

  return newKernel;
}
// Creates a CUDA kernel bound to this device, gives it kernel data carrying
// the device and context of this device, and loads it from the in-memory
// image [cache].  Returns the new kernel.
kernel_v* device_t<CUDA>::loadKernelFromLibrary(const char *cache,
                                                const std::string &functionName){
  OCCA_EXTRACT_DATA(CUDA, Device);

  kernel_v *newKernel = new kernel_t<CUDA>;
  newKernel->dev = dev;

  CUDAKernelData_t *kernelData = new CUDAKernelData_t;
  newKernel->data = kernelData;

  kernelData->device  = data_.device;
  kernelData->context = data_.context;

  newKernel->loadFromLibrary(cache, functionName);

  return newKernel;
}
// Creates a CUDA kernel bound to this device, gives it kernel data carrying
// the device and context of this device, and builds it from the binary file
// [filename].  Returns the new kernel.
kernel_v* device_t<CUDA>::buildKernelFromBinary(const std::string &filename,
                                                const std::string &functionName){
  OCCA_EXTRACT_DATA(CUDA, Device);

  kernel_v *newKernel = new kernel_t<CUDA>;
  newKernel->dev = dev;

  CUDAKernelData_t *kernelData = new CUDAKernelData_t;
  newKernel->data = kernelData;

  kernelData->device  = data_.device;
  kernelData->context = data_.context;

  newKernel->buildFromBinary(filename, functionName);

  return newKernel;
}
// Allocates [bytes] bytes with cuMemAlloc and wraps the allocation in a new
// memory object bound to this device.  When [source] is not NULL, [bytes]
// bytes are copied from it into the new memory (offset 0).
// Returns the new memory object.
memory_v* device_t<CUDA>::malloc(const uintptr_t bytes,
                                 void *source){
  OCCA_EXTRACT_DATA(CUDA, Device);

  memory_v *mem = new memory_t<CUDA>;

  CUdeviceptr *devicePtr = new CUdeviceptr;

  mem->dev    = dev;
  mem->handle = devicePtr;
  mem->size   = bytes;

  OCCA_CUDA_CHECK("Device: malloc",
                  cuMemAlloc(devicePtr, bytes));

  if(source == NULL)
    return mem;

  mem->copyFrom(source, bytes, 0);

  return mem;
}
// Initializes this CUDA device from the argument map [aim].
//
// Requires a "deviceID" entry; when it is missing a message is printed and
// the int 1 is thrown.  Otherwise the device with that ordinal is fetched and
// a context is created on it, both stored in the freshly allocated device
// data.
void device_t<CUDA>::setup(argInfoMap &aim){
  cuda::init();

  data = new CUDADeviceData_t;

  OCCA_EXTRACT_DATA(CUDA, Device);

  const bool deviceGiven = aim.has("deviceID");

  if(!deviceGiven){
    std::cout << "[CUDA] device not given [deviceID]\n";
    throw 1;
  }

  const int deviceID = aim.iGet("deviceID");

  OCCA_CUDA_CHECK("Device: Creating Device",
                  cuDeviceGet(&data_.device, deviceID));

  OCCA_CUDA_CHECK("Device: Creating Context",
                  cuCtxCreate(&data_.context, CU_CTX_SCHED_AUTO, data_.device));
}
// Computes the cached-binary name for [filename].
//
// Adds the CUDA defines and OCCA keywords to [info_] (which is modified), and
// salts the name with the info salt, the parser version and the device's
// compiler environment script, compiler and compiler flags.
std::string kernel_t<CUDA>::getCachedBinaryName(const std::string &filename,
                                                kernelInfo &info_){
  OCCA_EXTRACT_DATA(CUDA, Kernel);

  info_.addDefine("OCCA_USING_GPU" , 1);
  info_.addDefine("OCCA_USING_CUDA", 1);

  info_.addOCCAKeywords(occaCUDADefines);

  std::stringstream saltStream;

  saltStream << "CUDA";
  saltStream << info_.salt();
  saltStream << parser::version;
  saltStream << dev->dHandle->compilerEnvScript;
  saltStream << dev->dHandle->compiler;
  saltStream << dev->dHandle->compilerFlags;

  const std::string saltText = saltStream.str();

  return getCachedName(filename, saltText);
}
// Creates a normal COI buffer of [bytes] bytes for the chief process, passing
// [source] to COIBufferCreate as the initial data, and wraps it in a new
// memory object bound to this device.  Returns the new memory object.
memory_v* device_t<COI>::malloc(const uintptr_t bytes,
                                void *source){
  OCCA_EXTRACT_DATA(COI, Device);

  memory_v *mem = new memory_t<COI>;

  coiMemory *buffer = new coiMemory;

  mem->dev    = dev;
  mem->handle = buffer;
  mem->size   = bytes;

  OCCA_COI_CHECK("Device: Malloc",
                 COIBufferCreate(bytes,
                                 COI_BUFFER_NORMAL,
                                 0,
                                 source,
                                 1,
                                 &(data_.chiefID),
                                 buffer) );

  return mem;
}
// Loads the shared library [filename] into the chief process and fetches the
// handle of the function [functionName_] from it.
//
// The library name handed to COI is the file name of [filename] truncated at
// its first '.'.  Returns this kernel.
kernel_t<COI>* kernel_t<COI>::buildFromBinary(const std::string &filename,
                                               const std::string &functionName_){
  OCCA_EXTRACT_DATA(COI, Kernel);

  functionName = functionName_;

  std::string libPath, soname;

  getFilePrefixAndName(filename, libPath, soname);

  // Drop everything from the first '.' onwards
  const std::string::size_type dotPos = soname.find('.');

  if(dotPos != std::string::npos)
    soname.erase(dotPos);

  COILIBRARY outLibrary;

  OCCA_COI_CHECK("Kernel: Loading Kernel To Chief",
                 COIProcessLoadLibraryFromFile(data_.chiefID,
                                               filename.c_str(),
                                               soname.c_str(),
                                               NULL,
                                               &outLibrary));

  const char *c_functionName = functionName.c_str();

  OCCA_COI_CHECK("Kernel: Getting Handle",
                 COIProcessGetFunctionHandles(data_.chiefID,
                                              1,
                                              &c_functionName,
                                              &(data_.kernel)));

  return this;
}
kernel_t<COI>* kernel_t<COI>::buildFromSource(const std::string &filename, const std::string &functionName_, const kernelInfo &info_){ functionName = functionName_; kernelInfo info = info_; info.addDefine("OCCA_USING_CPU", 1); info.addDefine("OCCA_USING_COI", 1); info.addOCCAKeywords(occaCOIDefines); std::stringstream salt; salt << "COI" << info.salt() << dev->dHandle->compilerEnvScript << dev->dHandle->compiler << dev->dHandle->compilerFlags << functionName; std::string cachedBinary = getCachedName(filename, salt.str()); std::string libPath, soname; getFilePrefixAndName(cachedBinary, libPath, soname); std::string libName = "lib" + soname + ".so"; cachedBinary = libPath + libName; struct stat buffer; bool fileExists = (stat(cachedBinary.c_str(), &buffer) == 0); if(fileExists){ std::cout << "Found cached binary of [" << filename << "] in [" << cachedBinary << "]\n"; return buildFromBinary(cachedBinary, functionName); } if(!haveFile(cachedBinary)){ waitForFile(cachedBinary); return buildFromBinary(cachedBinary, functionName); } std::string iCachedBinary = createIntermediateSource(filename, cachedBinary, info); std::stringstream command; if(dev->dHandle->compilerEnvScript.size()) command << dev->dHandle->compilerEnvScript << " && "; command << dev->dHandle->compiler #if (OCCA_OS == LINUX_OS) || (OCCA_OS == OSX_OS) << " -x c++ -w -nodefaultlibs -fPIC -shared" #else << " /TP /LD /D MC_CL_EXE" #endif << ' ' << dev->dHandle->compilerFlags << ' ' << info.flags << ' ' << iCachedBinary #if (OCCA_OS == LINUX_OS) || (OCCA_OS == OSX_OS) << " -o " << cachedBinary #else << " /link /OUT:" << cachedBinary #endif << std::endl; const std::string &sCommand = command.str(); std::cout << "Compiling [" << functionName << "]\n" << sCommand << "\n"; const int compileError = system(sCommand.c_str()); if(compileError){ releaseFile(cachedBinary); throw 1; } OCCA_EXTRACT_DATA(COI, Kernel); COILIBRARY outLibrary; const COIRESULT loadingLibraryResult = 
COIProcessLoadLibraryFromFile(data_.chiefID, cachedBinary.c_str(), soname.c_str(), NULL, &outLibrary); if(errorCode != COI_SUCCESS) releaseFile(cachedBinary); OCCA_COI_CHECK("Kernel: Loading Kernel To Chief", loadingLibraryResult); const char *c_functionName = functionName.c_str(); const COIRESULT getFunctionHandleResult = COIProcessGetFunctionHandles(data_.chiefID, 1, &c_functionName, &(data_.kernel)); if(errorCode != COI_SUCCESS) releaseFile(cachedBinary); OCCA_COI_CHECK("Kernel: Getting Handle", getFunctionHandleResult); releaseFile(cachedBinary); return this; }
// Allocates texture memory on this device.
//
// Creates a CUDA array (1D when [dim] is 1, otherwise 2D using dims.x and
// dims.y) with the format and channel count of [type], creates a surface
// object over it, records the texture layout in the memory object and copies
// [source] into it.  Returns the new memory object.
memory_v* device_t<CUDA>::talloc(const int dim, const occa::dim &dims,
                                 void *source,
                                 occa::formatType type, const int permissions){
  OCCA_EXTRACT_DATA(CUDA, Device);

  const bool is1D = (dim == 1);

  memory_v *mem = new memory_t<CUDA>;

  CUDATextureData_t *textureData = new CUDATextureData_t;

  mem->dev    = dev;
  mem->handle = textureData;
  mem->size   = (is1D ? dims.x : (dims.x * dims.y)) * type.bytes();

  mem->isTexture = true;

  mem->textureInfo.dim = dim;
  mem->textureInfo.w   = dims.x;
  mem->textureInfo.h   = dims.y;
  mem->textureInfo.d   = dims.z;

  mem->textureInfo.bytesInEntry = type.bytes();

  CUarray      &array   = textureData->array;
  CUsurfObject &surface = textureData->surface;

  //---[ Array ]------------------------
  CUDA_ARRAY_DESCRIPTOR arrayDesc;
  CUDA_RESOURCE_DESC    surfDesc;

  memset(&arrayDesc, 0, sizeof(arrayDesc));
  memset(&surfDesc , 0, sizeof(surfDesc));

  arrayDesc.Width       = dims.x;
  arrayDesc.Height      = is1D ? 0 : dims.y;   // Height 0 for the 1D case
  arrayDesc.Format      = *((CUarray_format*) type.format<CUDA>());
  arrayDesc.NumChannels = type.count();

  OCCA_CUDA_CHECK("Device: Creating Array",
                  cuArrayCreate(&array, (CUDA_ARRAY_DESCRIPTOR*) &arrayDesc) );

  //---[ Surface ]----------------------
  surfDesc.res.array.hArray = array;
  surfDesc.resType          = CU_RESOURCE_TYPE_ARRAY;

  OCCA_CUDA_CHECK("Device: Creating Surface Object",
                  cuSurfObjectCreate(&surface, &surfDesc) );

  int *addressMode = new int;
  mem->textureInfo.arg = addressMode;

  *addressMode = CUDA_ADDRESS_CLAMP;

  // NOTE(review): unlike malloc(), source is copied without a NULL check --
  // confirm that copyFrom() tolerates a NULL source.
  mem->copyFrom(source);

  /*
    if(dims == 3){
    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
    memset(&arrayDesc, 0, sizeof(arrayDesc);

    arrayDesc.Width  = size.x;
    arrayDesc.Height = size.y;
    arrayDesc.Depth  = size.z;

    arrayDesc.Format      = type.format<CUDA>();
    arrayDesc.NumChannels = type.count();

    cuArray3DCreate(&arr, (CUDA_ARRAY3D_DESCRIPTOR*) &arrayDesc);
    }
  */

  return mem;
}
// Builds this CUDA kernel from the source file [filename].
//
// A cached binary is reused when it already exists.  Otherwise the source is
// run through the compiler twice: once as an object build with verbose ptxas
// output (its result is ignored) and once to produce the PTX file that is
// loaded as a module, from which [functionName_] is fetched.
//
// Throws the int 1 when the PTX compilation fails; CUDA failures are reported
// through OCCA_CUDA_CHECK.  Returns this kernel.
kernel_t<CUDA>* kernel_t<CUDA>::buildFromSource(const std::string &filename,
                                                 const std::string &functionName_,
                                                 const kernelInfo &info_){
  OCCA_EXTRACT_DATA(CUDA, Kernel);

  functionName = functionName_;

  kernelInfo info = info_;

  std::string cachedBinary = getCachedBinaryName(filename, info);

  struct stat statBuffer;
  const bool binaryIsCached = (stat(cachedBinary.c_str(), &statBuffer) == 0);

  if(binaryIsCached){
    std::cout << "Found cached binary of [" << filename << "] in [" << cachedBinary << "]\n";
    return buildFromBinary(cachedBinary, functionName);
  }

  // NOTE(review): haveFile() appears to decide whether this process builds
  // the binary; when it does not, wait for the file and load it instead.
  if(!haveFile(cachedBinary)){
    waitForFile(cachedBinary);

    return buildFromBinary(cachedBinary, functionName);
  }

  std::string iCachedBinary = createIntermediateSource(filename,
                                                       cachedBinary,
                                                       info);

  std::string libPath, soname;
  getFilePrefixAndName(cachedBinary, libPath, soname);

  std::string oCachedBinary = libPath + "o_" + soname + ".o";

  // Only add an -arch flag when the user's compiler flags do not set one
  std::string archFlag = "";

  const bool archGiven = (dev->dHandle->compilerFlags.find("-arch=sm_") != std::string::npos);

  if(!archGiven){
    int major, minor;

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Getting CUDA Device Arch",
                    cuDeviceComputeCapability(&major, &minor, data_.device) );

    std::stringstream archStream;
    archStream << " -arch=sm_" << major << minor << ' ';

    archFlag = archStream.str();
  }

  std::stringstream command;

  //---[ PTX Check Command ]----------
  if(dev->dHandle->compilerEnvScript.size() > 0)
    command << dev->dHandle->compilerEnvScript << " && ";

  command << dev->dHandle->compiler;
  command << ' ' << dev->dHandle->compilerFlags;
  command << archFlag;
  command << " -Xptxas -v,-dlcm=cg,-abi=no";
  command << ' ' << info.flags;
  command << " -x cu -c " << iCachedBinary;
  command << " -o " << oCachedBinary;

  const std::string ptxCommand = command.str();

  std::cout << "Compiling [" << functionName << "]\n" << ptxCommand << "\n";

#if (OCCA_OS == LINUX_OS) || (OCCA_OS == OSX_OS)
  const int ptxError = system(ptxCommand.c_str());
#else
  const int ptxError = system(("\"" +  ptxCommand + "\"").c_str());
#endif

  // Not needed here I guess
  // if(ptxError){
  //   releaseFile(cachedBinary);
  //   throw 1;
  // }

  //---[ Compiling Command ]----------
  command.str("");

  command << dev->dHandle->compiler;
  command << " -o " << cachedBinary;
  command << " -ptx -I.";
  command << ' ' << dev->dHandle->compilerFlags;
  command << archFlag;
  command << ' ' << info.flags;
  command << " -x cu " << iCachedBinary;

  const std::string compileCommand = command.str();

  std::cout << compileCommand << '\n';

  const int compileError = system(compileCommand.c_str());

  if(compileError){
    releaseFile(cachedBinary);
    throw 1;
  }

  //---[ Load Module + Function ]-----
  // On failure the file is released before OCCA_CUDA_CHECK reports the error
  const CUresult moduleLoadError = cuModuleLoad(&data_.module,
                                                cachedBinary.c_str());

  if(moduleLoadError)
    releaseFile(cachedBinary);

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                  moduleLoadError);

  const CUresult moduleGetFunctionError = cuModuleGetFunction(&data_.function,
                                                              data_.module,
                                                              functionName.c_str());

  if(moduleGetFunctionError)
    releaseFile(cachedBinary);

  OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                  moduleGetFunctionError);

  releaseFile(cachedBinary);

  return this;
}
// Initializes this COI device.
//
// Looks up MIC engine number [device], makes sure the occaCOIMain host binary
// exists in the cache (compiling it from the embedded source when needed),
// starts the chief process from it and fetches the handles of the
// occaKernelWith<N>Argument(s) wrapper functions.
//
// [memoryAllocated] is the initial buffer space handed to the chief process;
// 0 selects the 4 GB default.
void device_t<COI>::setup(const int device, const int memoryAllocated){
  data = new COIDeviceData_t;

  OCCA_EXTRACT_DATA(COI, Device);

  uint32_t deviceCount;
  OCCA_COI_CHECK("Device: Get Count",
                 COIEngineGetCount(COI_ISA_MIC, &deviceCount));

  OCCA_CHECK(device < deviceCount);

  OCCA_COI_CHECK("Device: Get Handle",
                 COIEngineGetHandle(COI_ISA_MIC, device, &data_.deviceID) );

  std::stringstream salt;
  salt << "COI"
       << occaCOIMain;

  std::string cachedBinary = getCachedName("occaCOIMain", salt.str());

  struct stat buffer;
  bool fileExists = (stat(cachedBinary.c_str(), &buffer) == 0);

  if(fileExists)
    std::cout << "Found cached binary of [occaCOIMain] in [" << cachedBinary << "]\n";
  else{
    //---[ Write File ]-----------------
    std::string prefix, name;

    getFilePrefixAndName(cachedBinary, prefix, name);

    const std::string iCachedBinary = prefix + "i_" + name;

    // NOTE(review): haveFile() appears to decide whether this process builds
    // the binary; otherwise wait until the file is available.
    if(haveFile(cachedBinary)){
      std::cout << "Making [" << iCachedBinary << "]\n";

      std::ofstream fs;
      fs.open(iCachedBinary.c_str());

      fs << occaCOIMain;

      fs.close();

      std::stringstream command;

      command << dev->dHandle->compiler
              << " -o " << cachedBinary
              << " -x c++"
              << ' ' << dev->dHandle->compilerFlags
              << ' ' << iCachedBinary;

      const std::string &sCommand = command.str();

      std::cout << "Compiling [" << functionName << "]\n" << sCommand << "\n\n";

      // NOTE(review): the compiler's exit status is ignored here; a failed
      // build only surfaces when the process is created below.
      system(sCommand.c_str());

      releaseFile(cachedBinary);
    }
    else
      waitForFile(cachedBinary);
  }

  // [-] Tentative
  std::string SINK_LD_LIBRARY_PATH;

  char *c_SINK_LD_LIBRARY_PATH = getenv("SINK_LD_LIBRARY_PATH");
  if(c_SINK_LD_LIBRARY_PATH != NULL)
    SINK_LD_LIBRARY_PATH = std::string(c_SINK_LD_LIBRARY_PATH);

  // 4 GB default.  Computed in 64 bits: (4 << 30) does not fit in an int.
  const uint64_t defaultMemory = (((uint64_t) 4) << 30);

  const uint64_t initialMemory = (memoryAllocated ?
                                  (uint64_t) memoryAllocated :
                                  defaultMemory);

  OCCA_COI_CHECK("Device: Initializing",
                 COIProcessCreateFromFile(data_.deviceID,
                                          cachedBinary.c_str(),
                                          0   , NULL,
                                          true, NULL,
                                          true, NULL,
                                          initialMemory,
                                          SINK_LD_LIBRARY_PATH.c_str(),
                                          &(data_.chiefID)) );

  const char *kernelNames[] = {"occaKernelWith1Argument"  , "occaKernelWith2Arguments" , "occaKernelWith3Arguments" ,
                               "occaKernelWith4Arguments" , "occaKernelWith5Arguments" , "occaKernelWith6Arguments" ,
                               "occaKernelWith7Arguments" , "occaKernelWith8Arguments" , "occaKernelWith9Arguments" ,
                               "occaKernelWith10Arguments", "occaKernelWith11Arguments", "occaKernelWith12Arguments",
                               "occaKernelWith13Arguments", "occaKernelWith14Arguments", "occaKernelWith15Arguments",
                               "occaKernelWith16Arguments", "occaKernelWith17Arguments", "occaKernelWith18Arguments",
                               "occaKernelWith19Arguments", "occaKernelWith20Arguments", "occaKernelWith21Arguments",
                               "occaKernelWith22Arguments", "occaKernelWith23Arguments", "occaKernelWith24Arguments",
                               "occaKernelWith25Arguments"};

  // Number of wrapper names above (25); data_.kernelWrapper receives one
  // handle per name.
  const uint32_t kernelCount = (uint32_t) (sizeof(kernelNames) / sizeof(kernelNames[0]));

  OCCA_COI_CHECK("Device: Getting Kernel Wrappers",
                 COIProcessGetFunctionHandles(data_.chiefID,
                                              kernelCount,
                                              kernelNames,
                                              data_.kernelWrapper));
}