/**
 * Uploads the hue/colour-space matrix and the constant alpha value into the
 * module-level symbols "constHueColorSpaceMat" and "constAlpha".
 *
 * @param module  Loaded module that defines both symbols.
 * @param hueCSC  Host buffer holding the matrix; it must provide at least as
 *                many bytes as the driver reports for "constHueColorSpaceMat".
 * @return CUDA_SUCCESS when both symbols were located and written, otherwise
 *         the CUresult of the first driver call that failed.
 */
CUresult updateConstantMemory_drvapi(CUmodule module, float *hueCSC)
{
    CUdeviceptr d_constHueCSC, d_constAlpha;
    size_t d_cscBytes, d_alphaBytes;

    // First grab the global device pointers from the CUBIN
    CUresult error = cuModuleGetGlobal(&d_constHueCSC, &d_cscBytes, module, "constHueColorSpaceMat");
    if (error != CUDA_SUCCESS)
    {
        return error;
    }

    error = cuModuleGetGlobal(&d_constAlpha, &d_alphaBytes, module, "constAlpha");
    if (error != CUDA_SUCCESS)
    {
        return error;
    }

    // Copy the constants to video memory
    error = cuMemcpyHtoD(d_constHueCSC, reinterpret_cast<const void *>(hueCSC), d_cscBytes);
    getLastCudaDrvErrorMsg("cuMemcpyHtoD (d_constHueCSC) copy to Constant Memory failed");
    if (error != CUDA_SUCCESS)
    {
        return error;
    }

    uint32 cudaAlpha = ((uint32)0xff << 24);

    // Never read past the end of the host-side value, even if the device
    // symbol turns out to be larger than a single uint32.
    const size_t alphaCopyBytes = (d_alphaBytes < sizeof(cudaAlpha)) ? d_alphaBytes : sizeof(cudaAlpha);

    error = cuMemcpyHtoD(d_constAlpha, reinterpret_cast<const void *>(&cudaAlpha), alphaCopyBytes);
    getLastCudaDrvErrorMsg("cuMemcpyHtoD (d_constAlpha) copy to Constant Memory failed");

    return error;
}
int main(int argc, char ** argv){ int i; if( (argc>=2) && (atoi(argv[1])!=RANK)) error("rank %d mandatory",RANK); printf("CUDA RANK=%d\n",RANK); kernel.print(); // build busylist busylist = (uint32_t*)malloc_file(CNK*sizeof(uint32_t),FMODE_RO,BLIST_FORMAT,RANK); // put busylist SafeCall(cuMemHostRegister(busylist,CNK*sizeof*busylist,CU_MEMHOSTREGISTER_DEVICEMAP)); SafeCall(cuMemHostGetDevicePointer(&host_busylist,busylist,0)); SafeCall(cuModuleGetGlobal(&dev_busylist,&bytes,kernel.module[0].module,"busylist")); if(bytes!=sizeof(host_busylist)) error("busylist!"); SafeCall(cuMemcpyHtoD(dev_busylist,&host_busylist,bytes)); // put array #ifdef IN_mk_data mkdir(DATADIR,0755); errno=0; array = (unsigned char *)malloc_file(abytes(RANK,CNK),1,DATADIR"%d",RANK); #else array = (unsigned char *)malloc_file(abytes(RANK,CNK),0,DATADIR"%d",RANK); #endif SafeCall(cuMemHostRegister(array,abytes(RANK,CNK),CU_MEMHOSTREGISTER_DEVICEMAP)); SafeCall(cuMemHostGetDevicePointer(&host_array,array,0)); SafeCall(cuModuleGetGlobal(&dev_array,&bytes,kernel.module[0].module,"array")); if(bytes!=sizeof(host_array)) error("array!"); SafeCall(cuMemcpyHtoD(dev_array,&host_array,bytes)); #define THREADS 512 #define MAXG 65535 uint64_t nado = (cnk[RANK] +(THREADS-1))/THREADS; uint32_t gridx = nado>MAXG?MAXG:nado; uint32_t gridy = (nado+(MAXG-1))/MAXG; printf("gridy=%d gridx=%d THREAD=%d\n",gridy, gridx, THREADS); kernel.launch(params,THREADS,gridx,gridy); kernel.wait(); SafeCall(cuMemHostUnregister(busylist)); SafeCall(cuMemHostUnregister(array)); SafeCall(cuModuleGetGlobal(&dev_changed,&bytes,kernel.module[0].module,"changed")); if(bytes!=sizeof(changed)) error("changed!"); SafeCall(cuMemcpyDtoH(changed,dev_changed,bytes)); for(i=0;i<CACHESIZE;i++) total += changed[i]; printf("changed=%ju\n",total); return 0; }
void setupModuleResource(const char* kernelFileName) { CUmodule newModule = createModuleFromFile(kernelFileName); if (newModule != NULL) { if (module != NULL) cuModuleUnload(module); module = newModule; } checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "mainImage")); // TODO: take care of bytes size_t bytes; checkCudaErrors(cuModuleGetGlobal(&d_iResolution, &bytes, module, "iResolution")); checkCudaErrors(cuModuleGetGlobal(&d_iGlobalTime, &bytes, module, "iGlobalTime")); checkCudaErrors(cuModuleGetGlobal(&d_iMouse, &bytes, module, "iMouse")); checkCudaErrors(cuModuleGetGlobal(&d_fragColor, &d_fragColor_bytes, module, "fragColor")); }
/* Copy `size` bytes from `host` into the module-level symbol called `name`.
 *
 * The symbol is looked up in cuModule while the context is pushed; the
 * context is popped again before returning. The size reported by the driver
 * is fetched but not compared against `size`. */
void const_copy_to(const char *name, void *host, size_t size)
{
	size_t bytes;
	CUdeviceptr mem;

	cuda_push_context();

	/* resolve the symbol to a device address */
	cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
	//assert(bytes == size);

	/* upload the host data */
	cuda_assert(cuMemcpyHtoD(mem, host, size))

	cuda_pop_context();
}
void swanBindToConstantEx( const char *modname, const char *constname, size_t len , void *ptr ) { CUmodule mod = swanGetModule( modname ); CUdeviceptr p; size_t lenr; int err = cuModuleGetGlobal( &p, &lenr, mod, constname ); if( err != CUDA_SUCCESS ) { error ("swanBindToConstant failed -- no such name" ); } if( len != lenr ) { if( err != CUDA_SUCCESS ) { error ("swanBindToConstant failed -- size wrong" ); } } err = cuMemcpyHtoD( p, ptr, len ); if( err != CUDA_SUCCESS ) { error ("swanBindToConstant failed -- copy failed" ); } }
/**
 * Initialises `var` from the module-level device symbol that shares its name.
 *
 * The registered-variable table is searched for the first entry whose name
 * equals var.getName(); the owning module is then asked for the symbol's
 * device address and size, which are handed to var.init(). Nothing happens
 * if no table entry matches.
 *
 * If the driver lookup fails, the failure is asserted and var.init() receives
 * a null address and a zero size instead of uninitialised values.
 */
void CudaModuleScene::initCudaObj(ApexCudaVar& var)
{
	const char* varName = var.getName();

	for (int j = 0 ; j < numRegisteredVariables ; j++)
	{
		if (nvidia::strcmp(variableTable[j].varName, varName) == 0)
		{
			ApexCudaModule* cudaModule = getCudaModule(variableTable[j].modIndex);
			PX_ASSERT(cudaModule->isValid());

			CUdeviceptr cuDevPtr = 0;
			size_t size = 0;
			const CUresult res = cuModuleGetGlobal(&cuDevPtr, &size, cudaModule->getCuModule(), varName);
			PX_ASSERT(res == CUDA_SUCCESS);
			if (res != CUDA_SUCCESS)
			{
				// Do not trust partially written outputs after a failed lookup.
				cuDevPtr = 0;
				size = 0;
			}

			var.init(this, cudaModule, cuDevPtr, size, getGpuDispatcher()->getCudaContextManager());
			break;
		}
	}
}
/**
 * Returns the Buffer wrapping the module-level device symbol `name`.
 *
 * The first request for a given name queries the driver for the symbol's
 * address and size, wraps them in a newly allocated Buffer and stores that
 * Buffer in m_globalHash; later requests return the stored Buffer.
 * Driver failures are passed to checkError().
 */
Buffer& CudaModule::getGlobal(const std::string& name)
{
    Buffer* cached = m_globalHash[name];

    if (!cached)
    {
        CUdeviceptr devAddr;
        size_t numBytes;
        checkError(
            "cuModuleGetGlobal",
            cuModuleGetGlobal(&devAddr, &numBytes, m_module, name.c_str()));

        cached = new Buffer;
        cached->wrapCuda(devAddr, numBytes);
        m_globalHash[name] = cached;
    }

    return *cached;
}
// Looks up the module-level symbol `name` and returns it through *ppGlobal.
//
// A symbol that was already requested is served from _globals. Otherwise the
// driver is queried, a new CuGlobalMem describing the symbol is appended to
// _globals, and that object is returned. The driver result is routed through
// HANDLE_RESULT(); on the success paths CUDA_SUCCESS is returned.
CUresult CuModule::GetGlobal(const std::string& name, GlobalMemPtr* ppGlobal) {
	// Reuse an existing entry when the name is already known.
	const size_t count = _globals.size();
	for(size_t idx = 0; idx < count; ++idx) {
		if(name == _globals[idx]->Name()) {
			*ppGlobal = _globals[idx];
			return CUDA_SUCCESS;
		}
	}

	// Not seen before: resolve it through the driver.
	GlobalMemPtr mem(new CuGlobalMem);
	CUresult result = cuModuleGetGlobal(&mem->_deviceptr, &mem->_size,
		_module, name.c_str());
	HANDLE_RESULT();

	mem->_module = this;
	mem->_context = _context;
	mem->_globalName = name;

	_globals.push_back(mem);
	*ppGlobal = mem;
	return CUDA_SUCCESS;
}
/*
 * R entry point wrapping cuModuleGetGlobal().
 *
 * r_hmod : R reference to a CUmodule.
 * r_name : character vector; element 0 is the symbol name.
 *
 * Returns the value of R_cudaErrorInfo() when the driver call fails,
 * otherwise a named list of length 2: "dptr" (a "CUdeviceptr" reference)
 * and "bytes" (the symbol size as a numeric scalar).
 */
SEXP
R_auto_cuModuleGetGlobal(SEXP r_hmod, SEXP r_name)
{
    SEXP r_ans = R_NilValue;
    SEXP r_names;
    CUdeviceptr dptr;
    size_t bytes;
    CUresult status;

    CUmodule hmod = (CUmodule) getRReference(r_hmod);
    const char *name = CHAR(STRING_ELT(r_name, 0));

    status = cuModuleGetGlobal(&dptr, &bytes, hmod, name);
    if (status != CUDA_SUCCESS) {
        return R_cudaErrorInfo(status);
    }

    /* Both containers are protected before any further allocation. */
    PROTECT(r_ans = NEW_LIST(2));
    PROTECT(r_names = NEW_CHARACTER(2));

    SET_VECTOR_ELT(r_ans, 0, R_createRef((void*) dptr, "CUdeviceptr"));
    SET_VECTOR_ELT(r_ans, 1, ScalarReal(bytes));

    SET_STRING_ELT(r_names, 0, mkChar("dptr"));
    SET_STRING_ELT(r_names, 1, mkChar("bytes"));
    SET_NAMES(r_ans, r_names);

    UNPROTECT(2);
    return r_ans;
}
int GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, struct addr_pair **target_table) { CUmodule module; const char *const *var_names; const struct targ_fn_launch *fn_descs; unsigned int fn_entries, var_entries, i, j; CUresult r; struct targ_fn_descriptor *targ_fns; struct addr_pair *targ_tbl; const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data; struct ptx_image_data *new_image; struct ptx_device *dev; if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX) GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin" " (expected %u, received %u)", GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version)); GOMP_OFFLOAD_init_device (ord); dev = ptx_devices[ord]; nvptx_attach_host_thread_to_device (ord); link_ptx (&module, img_header->ptx_objs, img_header->ptx_num); /* The mkoffload utility emits a struct of pointers/integers at the start of each offload image. The array of kernel names and the functions addresses form a one-to-one correspondence. 
*/ var_entries = img_header->var_num; var_names = img_header->var_names; fn_entries = img_header->fn_num; fn_descs = img_header->fn_descs; targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair) * (fn_entries + var_entries)); targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor) * fn_entries); *target_table = targ_tbl; new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data)); new_image->target_data = target_data; new_image->module = module; new_image->fns = targ_fns; pthread_mutex_lock (&dev->image_lock); new_image->next = dev->images; dev->images = new_image; pthread_mutex_unlock (&dev->image_lock); for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++) { CUfunction function; r = cuModuleGetFunction (&function, module, fn_descs[i].fn); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r)); targ_fns->fn = function; targ_fns->launch = &fn_descs[i]; targ_tbl->start = (uintptr_t) targ_fns; targ_tbl->end = targ_tbl->start + 1; } for (j = 0; j < var_entries; j++, targ_tbl++) { CUdeviceptr var; size_t bytes; r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); targ_tbl->start = (uintptr_t) var; targ_tbl->end = targ_tbl->start + bytes; } return fn_entries + var_entries; }
int main(int argc, char **argv) { //data CUdeviceptr d_data0 = 0; CUdeviceptr d_data1 = 0; DataStruct *h_data0 = 0; DataStruct *h_data1 = 0; DataStruct h_data_reference0; DataStruct h_data_reference1; unsigned int memSize = sizeof(DataStruct); //device references CUcontext hContext = 0; CUdevice hDevice = 0; CUmodule hModule = 0; CUstream hStream = 0; // Initialize the device and get a handle to the kernel CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream); // Allocate memory on host and device if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL) { std::cerr << "Could not allocate host memory" << std::endl; exit(-1); } status = cuMemAlloc(&d_data0, memSize); if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL) { std::cerr << "Could not allocate host memory" << std::endl; exit(-1); } status = cuMemAlloc(&d_data1, memSize); if (status != CUDA_SUCCESS) printf("ERROR: during cuMemAlloc\n"); /////////////////////////////////////////////////////////////////////////////// //======================= test cases ========================================// /////////////////////////////////////////////////////////////////////////////// std::string name = ""; unsigned int testnum=0; unsigned int passed=0; //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /////////////////////// Ralf /////////////////////////////////////////////////// //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) 
passed++; testnum++; if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; //////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_constant"; ///////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_constant(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_calculate"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = 3.2; std::cout << 
"=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_calculate(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_parquetShader"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 1; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_parquetShader(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_dyn"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->u = h_data_reference0.u = 7; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_dyn(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simple"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = -4; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, 
h_data0, memSize); //run device function test_branch_simple(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simple"; // Branch 2 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 8; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_simple(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = -10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_simplePHI(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop"; ////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 100; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) 
//compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_math"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 1.4; h_data0->i = h_data_reference0.i = 3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_math(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_signedOperands"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = -7; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_signedOperands(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_constantOperands"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = -1.44; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_constantOperands(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) 
//compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop_semihard"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_semihard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->u = h_data_reference0.u = 3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_hard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 7; h_data0->u = h_data_reference0.u = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_hard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test 
passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_binaryInst"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 5; h_data0->f = h_data_reference0.f = -121.23; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_binaryInst(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_selp"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = -15; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_selp(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_complicated"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data_reference0.s.s.f = h_data0->s.s.f = 3.11; h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32; h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_complicated(&h_data_reference0); //run host reference 
if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_call"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_call(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*/////////////*/ name = "test_alloca"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->f = h_data_reference0.f = -3.23; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_alloca(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_alloca_complicated"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->f = h_data_reference0.f = 23.213; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_alloca_complicated(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test 
passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_globalVariables"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_globalVariables(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_specialRegisters_x"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3); //run device function runHostTestFunction(test_specialRegisters_x, &h_data_reference0, 2,3,4, 2,3); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_specialRegisters_y"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3); //run device function runHostTestFunction(test_specialRegisters_x, &h_data_reference0, 2,3,4, 2,3); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = 
"test_dualArgument"; ///////////////////////// setZero(h_data0,&h_data_reference0); setZero(h_data1,&h_data_reference1); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize); //run device function test_dualArgument(&h_data_reference0,&h_data_reference1); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} if(compareData(h_data1,&h_data_reference1)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_vector"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f; h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f; h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f; h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_vector(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_reg2Const"; ///////////////////////// setZero(h_data0,&h_data_reference0); /* unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); */ std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device 
function test_reg2Const(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_constantMemory"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f; unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_constantMemory(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_sharedMemory"; ///////////////////////// setZero(h_data0,&h_data_reference0); for(int i = 0; i < ARRAY_N/2; i++) h_data0->fa[i*2] = i; for(int i = 0; i < ARRAY_N/2; i++) h_data0->fa[i*2+1] = -i; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1); //run device function for(int i = 0; i < ARRAY_N/2; i++) h_data_reference0.fa[i] = i; for(int i = 0; i < ARRAY_N/2; i++) h_data_reference0.fa[i+32] = -i; // runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; 
////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_lightShader"; ///////////////////////// setZero(h_data0,&h_data_reference0); /* unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); */ std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function /* test_lightShader(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; */ /////////////////////////////////////////////////////////////////////////////// //======================= test cases END ====================================// /////////////////////////////////////////////////////////////////////////////// // Check the result std::cout << "\nPASSED " << passed << " tests" << std::endl; std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl; // Cleanup if (d_data0) { cuMemFree(d_data0); d_data0 = 0; } if (d_data1) { cuMemFree(d_data1); d_data1 = 0; } if (h_data0) { free(h_data0); h_data0 = 0; } if (h_data1) { free(h_data1); h_data1 = 0; } if (hModule) { cuModuleUnload(hModule); hModule = 0; } if (hStream) { cuStreamDestroy(hStream); hStream = 0; } if (hContext) { cuCtxDestroy(hContext); hContext = 0; } return 0; }
/*
 * Copy a guest-supplied string out of one scatter-gather entry into a freshly
 * allocated, NUL-terminated host buffer.
 *
 * At most max_len bytes are examined, and never more than the entry's own
 * iov_len, so a missing terminator in the guest buffer cannot cause a read
 * or write past the end of that entry. The copy stops at the first NUL byte
 * found inside that window.
 *
 * Returns a buffer the caller must free(), or NULL if allocation fails.
 */
static char *vq_dup_guest_string(const struct iovec *sg, size_t max_len)
{
    const char *src = sg->iov_base;
    size_t len = sg->iov_len < max_len ? sg->iov_len : max_len;
    const char *nul = len ? memchr(src, '\0', len) : NULL;
    char *copy;

    if (nul) {
        len = (size_t)(nul - src);
    }

    copy = malloc(len + 1);
    if (!copy) {
        return NULL;
    }
    if (len) {
        memcpy(copy, src, len);
    }
    copy[len] = '\0';
    return copy;
}

/*
 * Virtqueue output handler: drains every pending element from vq, decodes the
 * request header (struct param) found in the first out buffer, forwards the
 * request to the matching CUDA driver API call, and stores the call's status
 * in p->result. Each element is pushed back exactly once, and the frontend is
 * notified after the queue has been drained.
 *
 * Result conventions used by this handler (in addition to raw CUresult
 * values): 999 when switching the CUDA context or preparing a request's
 * arguments fails, 9999 when the kernel argument array cannot be allocated.
 *
 * NOTE(review): the scatter-gather indices derived from p->nr_pages and
 * p->val1 are not validated against the number of entries in the element;
 * confirm the frontend guarantees them.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        /* For every library routine: read the arguments from the buffer,
         * execute the call, and leave the results in the same buffer. */
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            /* Hardcoded path - needs improvement. All .cubin files are
             * expected under $QEMU_NFS_PATH, which is currently shared
             * between host and guest over NFS.
             * NOTE(review): the guest-supplied name is appended verbatim, so
             * it can contain "../" components - confirm this is acceptable. */
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            char *cubin;
            char *binname;
            size_t prefix_len, cubin_len;

            if (!nfs_path) {
                p->result = 999;
                break;
            }
            cubin = vq_dup_guest_string(&elem.out_sg[1], (size_t)-1);
            if (!cubin) {
                p->result = 999;
                break;
            }
            prefix_len = strlen(nfs_path);
            cubin_len = strlen(cubin);
            /* +1 for the terminating NUL of the concatenated path */
            binname = malloc(prefix_len + cubin_len + 1);
            if (!binname) {
                free(cubin);
                p->result = 999;
                break;
            }
            memcpy(binname, nfs_path, prefix_len);
            memcpy(binname + prefix_len, cubin, cubin_len + 1);
            free(cubin);

            /* Change the current CUDA context: each context has its own
             * virtual memory space, so isolation is ensured by switching
             * contexts. */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            char *name = vq_dup_guest_string(&elem.out_sg[1], (size_t)-1);
            if (!name) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            free(name);
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        /* Large buffers are allocated in smaller chunks in guest kernel
         * space; each chunk is fetched separately and copied to device
         * memory. Entry 1+2*i holds the data, entry 1+2*i+1 its size. */
        case CUMEMCPYHTOD: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        /* Device-to-host copies use the in buffers: entry 2*i receives the
         * data, entry 2*i+1 holds the chunk size. */
        case CUMEMCPYDTOH: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            /* Take a host-side copy limited to p->length bytes instead of
             * writing a terminator into the guest buffer, which could land
             * outside the buffer when p->length is too large. */
            char *name = vq_dup_guest_string(&elem.out_sg[1], (size_t)p->length);
            if (!name) {
                p->result = 999;
                break;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(name);
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            free(name);
            break;
        }
        case CULAUNCHKERNEL: {
            void **args;
            int i;

            /* p->val1 carries the number of kernel arguments; each one
             * lives in its own out buffer starting at index 1. */
            if (p->val1 < 0) {
                p->result = 999;
                break;
            }
            args = malloc(p->val1*sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args);
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }

        /* Hand the element back exactly once, whatever path was taken. */
        virtqueue_push(vq, &elem, 0);
    }

    /* Notify the frontend - triggers a virtual interrupt. */
    virtio_notify(vdev, vq);
    return;
}
extern "C" void binomialOptionsGPU( real *callValue, TOptionData *optionData, int optN, int argc, char **argv ) { if (!moduleLoaded) { kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]); compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize); module = loadPTX(ptx, argc, argv); moduleLoaded = true; } __TOptionData h_OptionData[MAX_OPTIONS]; for (int i = 0; i < optN; i++) { const real T = optionData[i].T; const real R = optionData[i].R; const real V = optionData[i].V; const real dt = T / (real)NUM_STEPS; const real vDt = V * sqrt(dt); const real rDt = R * dt; //Per-step interest and discount factors const real If = exp(rDt); const real Df = exp(-rDt); //Values and pseudoprobabilities of upward and downward moves const real u = exp(vDt); const real d = exp(-vDt); const real pu = (If - d) / (u - d); const real pd = (real)1.0 - pu; const real puByDf = pu * Df; const real pdByDf = pd * Df; h_OptionData[i].S = (real)optionData[i].S; h_OptionData[i].X = (real)optionData[i].X; h_OptionData[i].vDt = (real)vDt; h_OptionData[i].puByDf = (real)puByDf; h_OptionData[i].pdByDf = (real)pdByDf; } CUfunction kernel_addr; checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel")); CUdeviceptr d_OptionData; checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData")); checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData))); dim3 cudaBlockSize(128,1,1); dim3 cudaGridSize(optN, 1, 1); checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */ cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ 0,0, /* shared mem, stream */ NULL, /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); CUdeviceptr d_CallValue; checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue")); checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN *sizeof(real))); }
/*
 * Recovers buffers on the GPU.
 *
 * buffers      - host memory holding the buffers, buf_size bytes each
 * buf_size     - size in bytes of a single buffer
 * buf_ids      - buffer identifiers; entries [0, n) select the rows of the
 *                coding matrix to invert, entries [n, n+recover_last) name
 *                the buffers to rebuild
 * recover_last - number of buffers to rebuild
 * c            - Gibraltar context created by gib_init
 *
 * Returns GIB_SUC on success, GIB_ERR when one of the buffers to rebuild has
 * an id >= n, or the return value of gib_cpu_recover when the request is too
 * large for the GPU buffer (non-mmap builds only).
 *
 * The context pushed at entry is popped again on every return path.
 */
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
                  gib_context c )
{
    ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
    /* Requests larger than the preallocated GPU buffer go to the CPU path. */
    if (buf_size > gib_buf_size) {
        int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
        ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
        return rc;
    }
#endif

    int i, j;
    int n = c->n;
    int m = c->m;
    /* NOTE(review): these fixed-size matrices are indexed with n and m
     * without a range check here - confirm the limits are enforced elsewhere. */
    unsigned char A[128*128], inv[128*128], modA[128*128];

    for (i = n; i < n+recover_last; i++)
        if (buf_ids[i] >= n) {
            fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
            /* Undo the push from the top of the function before bailing
             * out, so the caller's context stack is left as it was found. */
            ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
            return GIB_ERR;
        }

    gib_galois_gen_A(A, m+n, n);

    /* Modify the matrix to have the failed drives reflected */
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            modA[i*n+j] = A[buf_ids[i]*n+j];

    gib_galois_gaussian_elim(modA, inv, n, n);

    /* Copy row buf_ids[i] into row i */
    for (i = n; i < n+recover_last; i++)
        for (j = 0; j < n; j++)
            modA[i*n+j] = inv[buf_ids[i]*n+j];

    int nthreads_per_block = 128;
    int fetch_size = sizeof(int)*nthreads_per_block;
    int nblocks = (buf_size + fetch_size - 1)/fetch_size;
    gpu_context gpu_c = (gpu_context) c->acc_context;

    /* Upload the rows that follow the first n*n entries of modA. */
    CUdeviceptr F_d;
    ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
    ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));

#if !GIB_USE_MMAP
    ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif

    /* Configure and launch */
    ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block, 1, 1));
    int offset = 0;
    void *ptr;
#if GIB_USE_MMAP
    CUdeviceptr cpu_buffers;
    ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
    ptr = (void *)cpu_buffers;
#else
    ptr = (void *)gpu_c->buffers;
#endif
    ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
    offset += sizeof(ptr);
    ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size, sizeof(buf_size)));
    offset += sizeof(buf_size);
    ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last, sizeof(recover_last)));
    offset += sizeof(recover_last);
    ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
    ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));

    /* Get the results back */
#if !GIB_USE_MMAP
    CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
    void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
    ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
    /* Checked like the equivalent call in gib_generate, so a failed kernel
     * is not silently reported as success. */
    ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return GIB_SUC;
}
/*
 * Generates the m coding buffers for the n data buffers held in `buffers`.
 *
 * buffers  - host memory holding the data buffers followed by room for the
 *            coding buffers, buf_size bytes each
 * buf_size - size in bytes of a single buffer
 * c        - Gibraltar context; c->n, c->m and c->acc_context are used
 *
 * Returns GIB_SUC, or the result of gib_generate_nc when the request does
 * not fit in the GPU buffer (non-mmap builds only).
 */
int gib_generate ( void *buffers, int buf_size, gib_context c )
{
    ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));

#if !GIB_USE_MMAP
    /* A request bigger than the GPU allocation cannot be processed in one
     * pass; hand it to the noncontiguous path, which splits it into jobs. */
    if (buf_size > gib_buf_size) {
        int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
        ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
        return rc;
    }
#endif

    /* Small enough: everything is done in a single launch. */
    gpu_context gpu_c = (gpu_context) c->acc_context;

    /* Launch geometry: each block fetches one int per thread. */
    int nthreads_per_block = 128;
    int bytes_per_block = sizeof(int)*nthreads_per_block;
    int nblocks = (buf_size + bytes_per_block - 1)/bytes_per_block;

    /* Build the coding matrix on the host and upload it to "F_d". */
    unsigned char F[256*256];
    CUdeviceptr F_d;
    gib_galois_gen_F(F, c->m, c->n);
    ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
    ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));

#if !GIB_USE_MMAP
    /* Stage the data buffers in device memory */
    ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif

    /* Pick the pointer the kernel works on: the mapped host buffer in mmap
     * builds, the device-side staging buffer otherwise. */
    void *ptr;
#if GIB_USE_MMAP
    CUdeviceptr cpu_buffers;
    ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
    ptr = (void *)cpu_buffers;
#else
    ptr = (void *)(gpu_c->buffers);
#endif

    /* Block shape, then the two kernel parameters packed back to back. */
    ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block, 1, 1));
    int offset = 0;
    ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
    offset += sizeof(ptr);
    ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size, sizeof(buf_size)));
    offset += sizeof(buf_size);
    ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
    ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

#if !GIB_USE_MMAP
    /* Fetch the m coding buffers, which sit right after the n data buffers. */
    CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
    void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
    ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
    /* Nothing to copy in mmap builds; just wait for the kernel to finish. */
    ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif

    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return GIB_SUC;
}
/*
 * Initializes the CPU and GPU runtimes.
 *
 * n - number of data buffers   (must be >= 2)
 * m - number of coding buffers (must be >= 2)
 * c - receives the context set up by gib_cpu_init; its acc_context member is
 *     filled with the GPU state created here
 *
 * Environment:
 *   GIB_GPU_ID    - optional device ordinal, honoured on every call
 *   GIB_CACHE_DIR - mandatory directory holding the generated PTX files
 *
 * Returns GIB_SUC. Every failure terminates the process, either through
 * ERROR_CHECK_FAIL or an explicit exit().
 */
int gib_init ( int n, int m, gib_context *c )
{
    CUcontext pCtx;
    CUdevice dev;

    if (m < 2 || n < 2) {
        fprintf(stderr, "It makes little sense to use Reed-Solomon coding when n or m is\n"
                "less than two. Use XOR or replication instead.\n");
        exit(1);
    }

    int rc_i = gib_cpu_init(n,m,c);
    if (rc_i != GIB_SUC) {
        fprintf(stderr, "gib_cpu_init returned %i\n", rc_i);
        exit(EXIT_FAILURE);
    }

    /* Initialize the CUDA driver once per process */
    if (!cudaInitialized) {
        ERROR_CHECK_FAIL(cuInit(0));
        cudaInitialized = 1;
    }

    /* Resolve the device on every call. Previously the environment variable
     * was only read during the first initialization, so every later context
     * silently ended up on device 0. */
    int gpu_id = 0;
    const char *gpu_id_env = getenv("GIB_GPU_ID");
    if (gpu_id_env != NULL) {
        int device_count;
        ERROR_CHECK_FAIL(cuDeviceGetCount(&device_count));
        gpu_id = atoi(gpu_id_env);
        if (gpu_id < 0 || device_count <= gpu_id) {
            fprintf(stderr, "GIB_GPU_ID is set to an invalid value (%i). There are \n"
                    "only %i GPUs in the system. Please specify another \n"
                    "value.\n", gpu_id, device_count);
            exit(-1);
        }
    }

    ERROR_CHECK_FAIL(cuDeviceGet(&dev, gpu_id));
#if GIB_USE_MMAP
    ERROR_CHECK_FAIL(cuCtxCreate(&pCtx, CU_CTX_MAP_HOST, dev));
#else
    ERROR_CHECK_FAIL(cuCtxCreate(&pCtx, 0, dev));
#endif

    /* Initialize the Gibraltar context */
    gpu_context gpu_c = (gpu_context) malloc(sizeof(struct gpu_context_t));
    if (gpu_c == NULL) {
        fprintf(stderr, "Unable to allocate the GPU context\n");
        exit(EXIT_FAILURE);
    }
    gpu_c->dev = dev;
    gpu_c->pCtx = pCtx;
    (*c)->acc_context = (void *)gpu_c;

    /* The generated PTX is cached on disk, so a cache directory is required. */
    const char *cache_dir = getenv("GIB_CACHE_DIR");
    if (cache_dir == NULL) {
        fprintf(stderr, "Your environment is not completely set. Please indicate a \n"
                "directory where generated files may be placed with the \n"
                "GIB_CACHE_DIR environment variable. This directory should\n"
                "not be publicly accessible and should exist.\n");
        exit(-1);
    }

    /* Let snprintf measure the file name instead of estimating the digit
     * counts with floating-point logarithms. */
    int filename_len = snprintf(NULL, 0, "%s/gib_cuda_%i+%i.ptx", cache_dir, n, m);
    if (filename_len < 0) {
        fprintf(stderr, "Unable to format the PTX file name\n");
        exit(EXIT_FAILURE);
    }
    char *filename = (char *)malloc((size_t)filename_len + 1);
    if (filename == NULL) {
        fprintf(stderr, "Unable to allocate the PTX file name\n");
        exit(EXIT_FAILURE);
    }
    snprintf(filename, (size_t)filename_len + 1, "%s/gib_cuda_%i+%i.ptx", cache_dir, n, m);

    /* Try to open the appropriate ptx file.  If it doesn't exist, compile a
     * new one. */
    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        /* Compile the ptx and open it */
        int pid = fork();
        if (pid == -1) {
            perror("Forking for nvcc");
            exit(-1);
        }
        if (pid == 0) {
            gib_cuda_compile(n, m, filename); /* never returns */
        }

        /* Wait for this specific child; a plain wait() could return the
         * status of an unrelated child of the calling process. */
        int status;
        if (waitpid(pid, &status, 0) == -1) {
            perror("Waiting for nvcc");
            exit(-1);
        }
        if (status != 0) {
            printf("Waiting for the compiler failed.\n");
            printf("The exit status was %i\n", WEXITSTATUS(status));
            printf("The child did%s exit normally.\n",
                   (WIFEXITED(status)) ? "" : " NOT");
            exit(-1);
        }

        fp = fopen(filename, "r");
        if (fp == NULL) {
            perror(filename);
            exit(-1);
        }
    }
    fclose(fp);

    /* If we got here, the ptx file exists.  Use it. */
    ERROR_CHECK_FAIL(cuModuleLoad(&(gpu_c->module), filename));
    ERROR_CHECK_FAIL(cuModuleGetFunction(&(gpu_c->checksum), (gpu_c->module),
                                         "_Z14gib_checksum_dP11shmem_bytesi"));
    ERROR_CHECK_FAIL(cuModuleGetFunction(&(gpu_c->recover), (gpu_c->module),
                                         "_Z13gib_recover_dP11shmem_bytesii"));

    /* Initialize the math libraries */
    gib_galois_init();
    unsigned char F[256*256];
    gib_galois_gen_F(F, m, n);

    /* Initialize/Allocate GPU-side structures */
    CUdeviceptr log_d, ilog_d, F_d;
    ERROR_CHECK_FAIL(cuModuleGetGlobal(&log_d, NULL, gpu_c->module, "gf_log_d"));
    ERROR_CHECK_FAIL(cuMemcpyHtoD(log_d, gib_gf_log, 256));
    ERROR_CHECK_FAIL(cuModuleGetGlobal(&ilog_d, NULL, gpu_c->module, "gf_ilog_d"));
    ERROR_CHECK_FAIL(cuMemcpyHtoD(ilog_d, gib_gf_ilog, 256));
    ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
    ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, m*n));
#if !GIB_USE_MMAP
    ERROR_CHECK_FAIL(cuMemAlloc(&(gpu_c->buffers), (n+m)*gib_buf_size));
#endif

    /* The context made current by cuCtxCreate is popped again here; the
     * other entry points push it around their own work. */
    ERROR_CHECK_FAIL(cuCtxPopCurrent((&gpu_c->pCtx)));
    free(filename);
    return GIB_SUC;
}