void Application::_init()
{
    // Bind this host thread to the highest-throughput CUDA device available.
    const int bestDevice = cutGetMaxGflopsDeviceId();
    CudaSafeCall(cudaSetDevice(bestDevice));

    // Hint the runtime to favor shared memory over L1 cache for kernels
    // launched by this application.
    CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
}
// Binds the calling host thread to `device`, sets the cache preference, and
// forces eager CUDA context creation with a tiny host->device copy so later
// allocations/timings do not pay the lazy-init cost.
// Throws std::runtime_error if any CUDA runtime call reports an error.
void initializeCUDARuntime(int device) {
  // Check these calls directly instead of discarding their status: a bad
  // device index would otherwise only surface at the final blanket check,
  // with no indication of which call failed.
  if (cudaSetDevice(device) != cudaSuccess ||
      cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) != cudaSuccess)
    throw std::runtime_error(
        std::string("initializeCUDARuntime: CUDA initialization problem\n"));

  // Dummy memcpy to init the CUDA runtime (context creation is lazy and
  // happens on the first real runtime use).
  util::Device1D<float> d_dummy(1);
  std::vector<float> h_dummy(1);
  d_dummy.copyFrom(h_dummy);

  if (cudaGetLastError() != cudaSuccess)
    throw std::runtime_error(
        std::string("initializeCUDARuntime: CUDA initialization problem\n"));
}
// Stages trajectory data and kernel constants on the device, configures the
// cache for the shared-memory-heavy gridding kernel, and computes the
// per-block shared-memory footprint.
// Returns the CUDA status accumulated by the setup calls.
cudaError_t GridGpu::prepareGPU(QVector<TrajPoint> &trajPoints)
{
    createTrajBlocks(trajPoints);
    copyKernelData();
    copyTrajBlocks();
    mallocGpu();

    // The gridding kernel keeps a per-block tile in shared memory.
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);

    // One complexGpu cell per point of a ceil(gridSize/gpuGridSize)^2 tile.
    m_sharedSize = powf(ceilf((float)m_gridSize / m_gpuGridSize), 2) * sizeof(complexGpu);
    qWarning() << "Shared mem size:" << m_sharedSize;

    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
        qWarning() << cudaGetErrorString(status);

    // BUG FIX: the function is declared to return cudaError_t but previously
    // fell off the end without a return statement (undefined behavior).
    return status;
}
// Selects the GPU to use and returns its device ID, or -1 if falling back
// to the CPU. The device index can be overridden with the CUDA_DEVICE
// environment variable (defaults to 0).
int init_cuda()
{
    // Read the requested device from the environment, defaulting to 0.
    const char* devstr = getenv("CUDA_DEVICE");
    const int env_dev = (devstr != NULL) ? atoi(devstr) : 0;
    int dev = env_dev;

    int devcnt;
    ebf::cudaErrCheck( cudaGetDeviceCount(&devcnt) );
    if( dev >= 0 && dev < devcnt )
    {
        ebf::cudaErrCheck( cudaSetDevice(dev) );
        // Prefer L1 cache over shared memory for subsequent kernels.
        // CONSISTENCY FIX: this call's status was previously discarded while
        // every other CUDA call here goes through ebf::cudaErrCheck.
        ebf::cudaErrCheck( cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) );
    }
    else
    {
        // Out-of-range (or negative) device request: signal CPU fallback.
        dev = -1;
        std::cerr << "# Cannot select the CUDA device. Using CPU!" << std::endl;
    }
    return dev;
}
/**
 * Ask the CUDA runtime to favor L1 cache over shared memory for
 * subsequently launched kernels. This is a hint to the driver and
 * does not improve performance in every case.
 */
void set_more_cache()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
}
// Test driver: selects the best CUDA device, initializes the resources
// manager from command-line arguments, runs test2(), and reports the result.
// Exit code is EXIT_SUCCESS when setup succeeds, EXIT_FAILURE otherwise.
int main(int argc, char ** argv)
{
	// ROBUSTNESS FIX: argv[1..3] were previously dereferenced with no argc
	// guard, which is undefined behavior for short command lines.
	if (argc < 4)
	{
		std::cerr<<"Error: at least three arguments are required"<<std::endl;
		return EXIT_FAILURE;
	}

	std::vector<std::string> parameters;
	parameters.push_back(std::string(argv[1]));
	parameters.push_back(std::string(argv[2]));

	device = eqMivt::getBestDevice();
	cudaFuncCache cacheConfig = cudaFuncCachePreferL1;
	if (cudaSuccess != cudaSetDevice(device) || cudaSuccess != cudaDeviceSetCacheConfig(cacheConfig))
	{
		std::cerr<<"Error setting up best device"<<std::endl;
		// BUG FIX: error paths previously returned 0 (success) — callers
		// such as shell scripts could not detect the failure.
		return EXIT_FAILURE;
	}

	// Optional trailing arguments: a numeric mO and/or a color file.
	// NOTE(review): in the argc == 6 branch the cast uses argv[5] but the
	// fallback assigns colorF from argv[4]; preserved as-is — confirm intent.
	std::string colorF = "";
	if (argc == 5)
	{
		try
		{
			std::string n(argv[4]);
			mO = boost::lexical_cast<double>(n);
		}
		catch(...)
		{
			colorF = argv[4];
		}
	}
	else if (argc == 6)
	{
		try
		{
			std::string n(argv[5]);
			mO = boost::lexical_cast<double>(n);
		}
		catch(...)
		{
			colorF = argv[4];
		}
	}

	if (!rM.init(parameters, argv[3], colorF, mO))
	{
		std::cerr<<"Error init resources manager"<<std::endl;
		return EXIT_FAILURE;
	}
	if (!rM.start())
	{
		std::cerr<<"Error start resources manager"<<std::endl;
		return EXIT_FAILURE;
	}

	std::cout<<"============ Creating pictures ============"<<std::endl;

	if (test2())
	{
		std::cout<<"Test ok"<<std::endl;
	}
	else
	{
		std::cout<<"Test Fail"<<std::endl;
	}

	rM.destroy();
	std::cout<<"End test"<<std::endl;
	return EXIT_SUCCESS;
}
// Scans the nonce range [pdata[19], max_nonce) for Groestlcoin shares on the
// GPU mapped to thr_id. Returns the number of valid nonces found this call
// (0, 1 or 2); found nonces are written back into pdata[19] / pdata[21], and
// *hashes_done receives the number of nonces attempted.
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done)
{
	// Per-thread pinned host buffer holding up to two candidate nonces
	// reported by the GPU (allocated once in the init block below).
	static THREAD uint32_t *foundNounce = nullptr;
	uint32_t start_nonce = pdata[19];
	// Larger default batch exponent on devices with SM version > 5.0;
	// device_intensity presumably applies any user override — TODO confirm.
	unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23;
	uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity);
	// Clamp the batch to the remaining range and round down to a multiple
	// of 1024 (mask clears the low 10 bits).
	uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00;

	if (opt_benchmark)
		ptarget[7] = 0x0000000f;

	// init: one-time per-thread device setup.
	static THREAD volatile bool init = false;
	if(!init)
	{
		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
		CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id]));
		groestlcoin_cpu_init(thr_id, throughputmax);
		// 2 * 4 bytes: room for two 32-bit candidate nonces.
		CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4));
		init = true;
	}

	// Byte-swapping is required (be32enc stores each header word big-endian).
	uint32_t endiandata[32];
	for (int kk=0; kk < 32; kk++)
		be32enc(&endiandata[kk], pdata[kk]);

	// Prepare the GPU context with the byte-swapped block header
	// (the nonce word is substituted later on the device).
	groestlcoin_cpu_setBlock(thr_id, endiandata);

	do
	{
		// GPU: hash one batch of `throughput` nonces starting at pdata[19].
		const uint32_t Htarg = ptarget[7];
		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]);

		if(stop_mining)
		{
			mining_has_stopped[thr_id] = true;
			cudaStreamDestroy(gpustream[thr_id]);
			pthread_exit(nullptr);
		}

		// 0xffffffff is the sentinel for "no candidate in this batch".
		if(foundNounce[0] < 0xffffffff)
		{
			// Re-hash the candidate on the CPU to reject GPU false positives.
			uint32_t tmpHash[8];
			endiandata[19] = SWAP32(foundNounce[0]);
			groestlhash(tmpHash, endiandata);
			if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
			{
				int res = 1;
				if(opt_benchmark)
					applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]);
				*hashes_done = pdata[19] - start_nonce + throughput;
				// A second candidate may have been found in the same batch.
				if(foundNounce[1] != 0xffffffff)
				{
					endiandata[19] = SWAP32(foundNounce[1]);
					groestlhash(tmpHash, endiandata);
					if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
					{
						pdata[21] = foundNounce[1];
						res++;
						if(opt_benchmark)
							applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]);
					}
					else
					{
						if(tmpHash[7] != Htarg)
						{
							applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]);
						}
					}
				}
				pdata[19] = foundNounce[0];
				return res;
			}
			else
			{
				if(tmpHash[7] != Htarg)
				{
					applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]);
				}
			}
		}

		// No share found in this batch: advance to the next nonce window.
		pdata[19] += throughput;

		// Surface any asynchronous CUDA failure from this batch.
		cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess)
		{
			applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err));
			exit(EXIT_FAILURE);
		}
	} while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

	*hashes_done = pdata[19] - start_nonce;
	return 0;
}