// if we use 2 threads on the same gpu, we need to reinit the threads
void cuda_reset_device(int thr_id, bool *init)
{
    int dev_id = device_map[thr_id % MAX_GPUS];
    cudaSetDevice(dev_id);
    if (init != NULL) {
        // with the init array, it's meant to be used in the algo's scan code...
        for (int i = 0; i < MAX_GPUS; i++) {
            if (device_map[i] == dev_id) {
                init[i] = false;
            }
        }
        // force exit from the algo's scan loops/function
        restart_threads();
        cudaDeviceSynchronize();
        while (cudaStreamQuery(NULL) == cudaErrorNotReady)
            usleep(1000);
    }
    cudaDeviceReset();
    if (opt_cudaschedule >= 0) {
        cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask));
    } else {
        cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
    }
    cudaDeviceSynchronize();
}
void MFNHashTypePlainCUDA::setupDevice() {
    trace_printf("CHHashTypeVPlainCUDA::setupDevice()\n");

    CHCUDAUtils *CudaUtils = MultiforcerGlobalClassFactory.getCudaUtilsClass();

    // Set the CUDA device
    trace_printf("Thread %d setting device to %d\n", this->threadId, this->gpuDeviceId);
    cudaSetDevice(this->gpuDeviceId);

    // If the user requests zerocopy and the device can handle it, add it.
    if (this->CommandLineData->GetUseZeroCopy() &&
            CudaUtils->getCudaCanMapHostMemory(this->gpuDeviceId)) {
        this->useZeroCopy = 1;
    }

    // If the device is integrated & can map memory, add it - integrated devices
    // are already sharing host memory, so no point in copying the data over.
    if (CudaUtils->getCudaIsIntegrated(this->gpuDeviceId) &&
            CudaUtils->getCudaCanMapHostMemory(this->gpuDeviceId)) {
        this->useZeroCopy = 1;
    }

    // Enable blocking sync. This dramatically reduces CPU usage.
    // If zero copy is being used, set DeviceMapHost as well.
    if (this->useZeroCopy) {
        cudaSetDeviceFlags(cudaDeviceBlockingSync | cudaDeviceMapHost);
    } else {
        cudaSetDeviceFlags(cudaDeviceBlockingSync);
    }
}
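The snippet above only records the device flags; the zero-copy path then needs a mapped host allocation and a device-side view of it. Below is a minimal sketch of that follow-up pattern, not taken from the code above; the helper name and buffer size are hypothetical.

// Hypothetical follow-up to cudaSetDeviceFlags(cudaDeviceMapHost): allocate a
// mapped (zero-copy) host buffer and get the device-side pointer kernels would use.
static float *alloc_zero_copy(size_t n, float **dev_view)
{
    float *host_buf = NULL;
    if (cudaHostAlloc((void **)&host_buf, n * sizeof(float), cudaHostAllocMapped) != cudaSuccess)
        return NULL;
    if (cudaHostGetDevicePointer((void **)dev_view, host_buf, 0) != cudaSuccess) {
        cudaFreeHost(host_buf);
        return NULL;
    }
    // *dev_view is valid inside kernels; each access crosses the bus (or shared
    // memory on integrated GPUs), so zero copy suits data that is touched only once.
    return host_buf;
}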
void THInit()
{
    static int init;

    if (init)
        return;
    init_yuv2rgb();
#ifndef USEBLAS
    blas_init();
#endif
    init = 1;
#if defined CUDNN && defined USECUDAHOSTALLOC
    // cuda_maphostmem = 1 requires that memory was allocated with cudaHostAlloc
    // cuda_maphostmem = 2 will work with malloc, but Tegra TX1 does not support
    // cudaHostRegister with cudaHostRegisterMapped
    struct cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    if (prop.canMapHostMemory)
    {
        errcheck(cudaSetDeviceFlags(cudaDeviceMapHost));
        cuda_maphostmem = 1;
    }
#endif
#ifdef OPENCL
    thopencl_init();
#endif
}
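The comment above distinguishes buffers allocated with cudaHostAlloc from plain malloc'd buffers that are pinned after the fact. A minimal sketch of that second (cuda_maphostmem = 2) path follows; the helper name is hypothetical and, per the note above, cudaHostRegisterMapped is rejected on Tegra TX1.

// Hypothetical cuda_maphostmem = 2 path: pin and map an existing malloc'd buffer
// instead of allocating it with cudaHostAlloc.
static void *map_existing_buffer(void *buf, size_t bytes)
{
    void *dev_view = NULL;
    if (cudaHostRegister(buf, bytes, cudaHostRegisterMapped) != cudaSuccess)
        return NULL;
    if (cudaHostGetDevicePointer(&dev_view, buf, 0) != cudaSuccess) {
        cudaHostUnregister(buf);
        return NULL;
    }
    return dev_view;   // same memory as buf, addressable from device code
}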
void setDeviceFlags(unsigned int flags)
{
    cudaError_t cudaError = cudaSetDeviceFlags(flags);

    if (cudaError != cudaSuccess)
    {
        throw cudaError;
    }
}
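A hedged usage sketch for the wrapper above, assuming its declaration is visible and that it runs before the first allocation or kernel launch creates the primary context; the flag combination and the surrounding main() are illustrative only.

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    try {
        // Set the flags before the CUDA context is initialized on the device,
        // so they actually take effect.
        setDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceMapHost);
    } catch (cudaError_t e) {
        std::fprintf(stderr, "cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(e));
        return 1;
    }
    return 0;
}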
template <typename T>
Args<T>::Args(int _width) {
    width = _width;
    gpuErrchk( cudaDeviceReset() );
    gpuErrchk( cudaSetDeviceFlags(cudaDeviceMapHost) );
    gpuErrchk( cudaHostAlloc((void **) &hostArray, width * sizeof(T),
                             cudaHostAllocWriteCombined | cudaHostAllocMapped) );
    gpuErrchk( cudaHostGetDevicePointer(&deviceArray, hostArray, 0) );
}
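The constructor above hands back a device-visible alias (deviceArray) of the pinned host buffer. The small kernel and launch below only illustrate how that alias might be consumed; scale() and the launch parameters are hypothetical and assume deviceArray is accessible on the instance, none of which is part of the original class.

// Hypothetical kernel consuming the mapped buffer created by Args<T>.
template <typename T>
__global__ void scale(T *data, int width, T factor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < width)
        data[i] *= factor;   // reads and writes land directly in the pinned host allocation
}

// Usage sketch:
//   Args<float> a(1024);
//   scale<<<(1024 + 255) / 256, 256>>>(a.deviceArray, 1024, 2.0f);
//   cudaDeviceSynchronize();   // make the zero-copy writes visible to the host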
int main(int argc, char **argv)
{
    int gpu = 0;
    char *str = getenv("MV2_COMM_WORLD_LOCAL_RANK");
    if (!str) str = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    if (str) gpu = atoi(str);

    str = getenv("MV2_COMM_WORLD_RANK");
    if (!str) str = getenv("OMPI_COMM_WORLD_RANK");
    int s_env = 0;
    if (str) s_env = atoi(str);

    printf("Rank %d running on GPU %d\n", s_env, gpu);

    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaSetDevice(gpu);

    /* initialize context */
    cudaFree(0);

    putenv(enable_cuda);
    putenv(enable_threads);

    MPI_Init(&argc, &argv);

    int s, p;
    MPI_Comm_rank(MPI_COMM_WORLD, &s);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    /* use a basic grid - processor mapping */
    proc_map = malloc(sizeof(int) * p);
    int i;
    for (i = 0; i < p; i++)
        proc_map[i] = i;

    /* basic test in n = 1 .. 7 dimensions */
    int nd;
    for (nd = 1; nd <= 7; nd++)
    {
        if (!s) printf("Testing distributed FFT in %d dimensions ...\n", nd);
        test_distributed_fft_nd(nd);
    }

    if (!s) printf("Compare against KISS FFT (d=1)...\n");
    for (i = 1; i < 14; ++i) /*for (i = 1; i < 24; ++i) */
    {
        int n = (1 << i);
        if (n <= p) continue;
        if (!s) printf("N=%d\n", n);
        test_distributed_fft_1d_compare(n);
    }

    if (!s) printf("Compare against KISS FFT (d=3)... \n");
    test_distributed_fft_3d_compare();

    free(proc_map);

    MPI_Finalize();
}
int main() {
    time_t start = time(NULL);

    int dim = L * (nmax + 1);

    const real epsg = EPSG;
    const real epsf = EPSF;
    const real epsx = EPSX;
    const int maxits = MAXITS;
    stpscal = 0.5;
    int info;

    real* x;
    int* nbd;
    real* l;
    real* u;
    memAlloc<real>(&x, dim);
    memAlloc<int>(&nbd, dim);
    memAlloc<real>(&l, dim);
    memAlloc<real>(&u, dim);
    memAllocHost<real>(&f_tb_host, &f_tb_dev, 1);

    cudaSetDeviceFlags(cudaDeviceMapHost);
    cublasCreate_v2(&cublasHd);

    U = 1;
    J = 0.1;
    mu = 0.5;

    initProb(x, nbd, l, u, dim);
    lbfgsbminimize(dim, 4, x, epsg, epsf, epsx, maxits, nbd, l, u, info);
    printf("info: %d\n", info);
    printf("f: %e\n", *f_tb_host);

    real* x_host = new real[dim];
    memCopy(x_host, x, dim * sizeof(real), cudaMemcpyDeviceToHost);
    printf("x: ");
    for (int i = 0; i < dim; i++) {
        printf("%f, ", x_host[i]);
    }
    printf("\n");

    memFreeHost(f_tb_host);
    memFree(x);
    memFree(nbd);
    memFree(l);
    memFree(u);
    cublasDestroy_v2(cublasHd);
    cudaDeviceReset();

    time_t end = time(NULL);
    printf("Runtime: %ld", end - start);
}
bool initCUDA(void)
{
#if __DEVICE_EMULATION__
    return true;
#else
    int count = 0;
    int i = 0;

    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no CUDA device.\n");
        return false;
    }
    printf("Found %d device(s).\n", count);

    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
        if (!prop.canMapHostMemory) {
            fprintf(stderr, "Device %d cannot map host memory!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    cudaSetDevice(cutGetMaxGflopsDeviceId());
    cudaSetDeviceFlags(cudaDeviceMapHost);
    checkCUDAError("cudaSetDeviceFlags");

    printf("CUDA initialized successfully\n");

    // Create the CUTIL timer
    cutilCheckError( cutCreateTimer( &timer) );

    return true;
#endif
}
int main(int argc, char ** argv)
{
    srand(static_cast<unsigned int>(time(0)));

    std::cout.setf(std::ios_base::scientific, std::ios_base::floatfield);
    std::cout.setf(std::ios_base::showpos);
    std::cout.setf(std::ios_base::showpoint);
    std::cout.precision(7);

#ifdef _CUDA_
    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#endif

#ifdef Test
    return Test_Func();
#else
    return Filtering(argc, argv);
#endif
}
void setDevice(int deviceNumber)
{
    int num_gpus = 0; // count of GPUs
    cudaGetDeviceCount(&num_gpus);
    //##ERROR handling
    if (num_gpus < 1) // check whether a CUDA device was found
    {
        throw std::runtime_error("no CUDA capable devices detected");
    }
    else if (num_gpus < deviceNumber) // check whether the device with deviceNumber can be selected
    {
        std::cerr << "no CUDA device " << deviceNumber << ", only " << num_gpus << " devices found" << std::endl;
        throw std::runtime_error("CUDA capable devices can't be selected");
    }

    cudaDeviceProp devProp;
    cudaError rc;
    CUDA_CHECK(cudaGetDeviceProperties(&devProp, deviceNumber));
    if (devProp.computeMode == cudaComputeModeDefault)
    {
        CUDA_CHECK(rc = cudaSetDevice(deviceNumber));
        if (cudaSuccess == rc)
        {
            cudaDeviceProp dprop;
            cudaGetDeviceProperties(&dprop, deviceNumber);
            //!\todo: write this only on debug
            log<ggLog::CUDA_RT > ("Set device to %1%: %2%") % deviceNumber % dprop.name;
        }
    }
    else
    {
        // GPU mode is cudaComputeModeExclusiveProcess and a free device is selected automatically.
        log<ggLog::CUDA_RT > ("Device is selected by CUDA automatically. (because cudaComputeModeDefault is not set)");
    }
    CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleYield));
}
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { bool bTestResults = true; if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\n> Command line options\n"); showHelp(); return 0; } printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure perfomance.\n"); showHelp(); bFullscreen = (checkCmdLineFlag(argc, (const char **) argv, "fullscreen") != 0); if (bFullscreen) { bShowSliders = false; } benchmark = (checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0); compareToCPU = ((checkCmdLineFlag(argc, (const char **) argv, "compare") != 0) || (checkCmdLineFlag(argc, (const char **) argv, "qatest") != 0)); QATest = (checkCmdLineFlag(argc, (const char **) argv, "qatest") != 0); useHostMem = (checkCmdLineFlag(argc, (const char **) argv, "hostmem") != 0); fp64 = (checkCmdLineFlag(argc, (const char **) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (checkCmdLineFlag(argc, (const char **) argv, "cpu") != 0); if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) { numDevsRequested = getCmdLineArgumentInt(argc, (const char **) argv, "numdevices"); if (numDevsRequested < 1) { printf("Error: \"number of CUDA devices\" specified %d is invalid. Value should be >= 1\n", numDevsRequested); exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); } else { printf("number of CUDA devices = %d\n", numDevsRequested); } } // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) { useHostMem = true; } int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { printf("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); exit(EXIT_SUCCESS); } printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video"); printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); printf("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP printf("> Simulation with CPU using OpenMP\n"); #else printf("> Simulation with CPU\n"); #endif } // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if(!useCpu) { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop. 
if (checkCmdLineFlag(argc, (const char **)argv, "device")) { customGPU = true; } devID = findCudaDevice(argc, (const char **)argv); } else // or with GL interop: { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { customGPU = true; } devID = findCudaGLDevice(argc, (const char **)argv); } checkCudaErrors(cudaGetDevice(&devID)); checkCudaErrors(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { printf("MultiGPU n-body requires CUDA 4.0 or later\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); } #endif // Initialize devices if (numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); exit(EXIT_SUCCESS); } if (customGPU) { cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, devID)); printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, i)); printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if (!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cudaDeviceReset(); exit(EXIT_SUCCESS); } if (numDevsRequested > 1) { checkCudaErrors(cudaSetDevice(i)); } checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) { bSupportDouble = false; } } //if(numDevsRequested > 1) // checkCudaErrors(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); } } numIterations = 0; p = 0; q = 1; if (checkCmdLineFlag(argc, (const char **)argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); } if (checkCmdLineFlag(argc, (const char **) argv, "p")) { p = getCmdLineArgumentInt(argc, (const char **)argv, "p"); } if (checkCmdLineFlag(argc, (const char **) argv, "q")) { q = getCmdLineArgumentInt(argc, (const char **)argv, "q"); } if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
static void init_cu2_device(QSP_ARG_DECL int index, Compute_Platform *cpp) { struct cudaDeviceProp deviceProp; cudaError_t e; Platform_Device *pdp; char name[LLEN]; char dev_name[LLEN]; char area_name[LLEN]; const char *name_p; char *s; Data_Area *ap; float comp_cap; if( index >= MAX_CUDA_DEVICES ){ sprintf(ERROR_STRING,"Program is compiled for a maximum of %d CUDA devices, can't inititialize device %d.", MAX_CUDA_DEVICES,index); ERROR1(ERROR_STRING); } if( verbose ){ sprintf(ERROR_STRING,"init_cu2_device %d BEGIN",index); advise(ERROR_STRING); } if( (e=cudaGetDeviceProperties(&deviceProp, index)) != cudaSuccess ){ describe_cuda_driver_error2("init_cu2_device","cudaGetDeviceProperties",e); return; } if (deviceProp.major == 9999 && deviceProp.minor == 9999){ sprintf(ERROR_STRING,"There is no CUDA device with dev = %d!?.\n",index); WARN(ERROR_STRING); /* What should we do here??? */ return; } /* Put the compute capability into a script variable so that we can use it */ comp_cap = deviceProp.major * 10 + deviceProp.minor; if( comp_cap > CUDA_COMP_CAP ){ sprintf(ERROR_STRING,"init_cu2_device: CUDA device %s has compute capability %d.%d, but program was configured for %d.%d!?", deviceProp.name,deviceProp.major,deviceProp.minor, CUDA_COMP_CAP/10,CUDA_COMP_CAP%10); WARN(ERROR_STRING); } /* BUG if there are multiple devices, we need to make sure that this is set * correctly for the current context!? */ sprintf(ERROR_STRING,"%d.%d",deviceProp.major,deviceProp.minor); assign_var(QSP_ARG "cuda_comp_cap",ERROR_STRING); /* What does this do??? */ e = cudaSetDeviceFlags( cudaDeviceMapHost ); if( e != cudaSuccess ){ describe_cuda_driver_error2("init_cu2_device", "cudaSetDeviceFlags",e); } strcpy(name,deviceProp.name); /* change spaces to underscores */ s=name; while(*s){ if( *s==' ' ) *s='_'; s++; } /* We might have two of the same devices installed in a single system. * In this case, we can't use the device name twice, because there will * be a conflict. The first one gets the name, then we have to check and * make sure that the name is not in use already. If it is, then we append * a number to the string... */ name_p = available_pfdev_name(QSP_ARG name,dev_name,cpp,MAX_CUDA_DEVICES); // reuse name as scratch string pdp = new_pfdev(QSP_ARG name_p); #ifdef CAUTIOUS if( pdp == NO_PFDEV ){ sprintf(ERROR_STRING,"CAUTIOUS: init_cu2_device: Error creating cuda device struct for %s!?",name_p); WARN(ERROR_STRING); return; } #endif /* CAUTIOUS */ /* Remember this name in case the default is not found */ if( first_cuda_dev_name == NULL ) first_cuda_dev_name = PFDEV_NAME(pdp); /* Compare this name against the default name set in * the environment, if it exists... */ if( default_cuda_dev_name != NULL && ! default_cuda_dev_found ){ if( !strcmp(PFDEV_NAME(pdp),default_cuda_dev_name) ) default_cuda_dev_found=1; } SET_PFDEV_PLATFORM(pdp,cpp); SET_PFDEV_CUDA_INFO( pdp, getbuf(sizeof(Cuda_Dev_Info)) ); SET_PFDEV_CUDA_DEV_INDEX(pdp,index); SET_PFDEV_CUDA_DEV_PROP(pdp,deviceProp); SET_PFDEV_CUDA_RNGEN(pdp,NULL); if( comp_cap >= 20 ){ SET_PFDEV_MAX_DIMS(pdp,3); } else { SET_PFDEV_MAX_DIMS(pdp,2); } //set_cuda_device(pdp); // is this call just so we can call cudaMalloc? PF_FUNC_NAME(set_device)(QSP_ARG pdp); // is this call just so we can call cudaMalloc? // address set to NULL says use custom allocator - see dobj/makedobj.c // BUG?? with pdp we may not need the DA_ flag??? 
sprintf(area_name,"%s.%s",PLATFORM_NAME(cpp),name_p); ap = pf_area_init(QSP_ARG area_name,NULL,0, MAX_CUDA_GLOBAL_OBJECTS,DA_CUDA_GLOBAL,pdp); if( ap == NO_AREA ){ sprintf(ERROR_STRING, "init_cu2_device: error creating global data area %s",area_name); WARN(ERROR_STRING); } // g++ won't take this line!? SET_AREA_CUDA_DEV(ap,pdp); //set_device_for_area(ap,pdp); SET_PFDEV_AREA(pdp,PFDEV_GLOBAL_AREA_INDEX,ap); /* We used to declare a heap for constant memory here, * but there wasn't much of a point because: * Constant memory can't be allocated, rather it is declared * in the .cu code, and placed by the compiler as it sees fit. * To have objects use this, we would have to declare a heap and * manage it ourselves... * There's only 64k, so we should be sparing... * We'll try this later... */ /* Make up another area for the host memory * which is locked and mappable to the device. * We don't allocate a pool here, but do it as needed... */ //strcpy(area_name,name_p); //strcat(area_name,"_host"); sprintf(area_name,"%s.%s_host",PLATFORM_NAME(cpp),name_p); ap = pf_area_init(QSP_ARG area_name,(u_char *)NULL,0,MAX_CUDA_MAPPED_OBJECTS, DA_CUDA_HOST,pdp); if( ap == NO_AREA ){ sprintf(ERROR_STRING, "init_cu2_device: error creating host data area %s",area_name); ERROR1(ERROR_STRING); } SET_AREA_CUDA_DEV(ap, pdp); //cuda_data_area[index][CUDA_HOST_AREA_INDEX] = ap; SET_PFDEV_AREA(pdp,PFDEV_HOST_AREA_INDEX,ap); /* Make up another psuedo-area for the mapped host memory; * This is the same memory as above, but mapped to the device. * In the current implementation, we create objects in the host * area, and then automatically create an alias on the device side. * There is a BUG in that by having this psuedo area in the data * area name space, a user could select it as the data area and * then try to create an object. We will detect this in make_dobj, * and complain. */ //strcpy(area_name,name_p); //strcat(area_name,"_host_mapped"); sprintf(area_name,"%s.%s_host_mapped",PLATFORM_NAME(cpp),name_p); ap = pf_area_init(QSP_ARG area_name,(u_char *)NULL,0,MAX_CUDA_MAPPED_OBJECTS, DA_CUDA_HOST_MAPPED,pdp); if( ap == NO_AREA ){ sprintf(ERROR_STRING, "init_cu2_device: error creating host-mapped data area %s",area_name); ERROR1(ERROR_STRING); } SET_AREA_CUDA_DEV(ap,pdp); //cuda_data_area[index][CUDA_HOST_MAPPED_AREA_INDEX] = ap; SET_PFDEV_AREA(pdp,PFDEV_HOST_MAPPED_AREA_INDEX,ap); // We don't change the data area by default any more when initializing... /* Restore the normal area */ //set_data_area(PFDEV_AREA(pdp,PFDEV_GLOBAL_AREA_INDEX)); if( verbose ){ sprintf(ERROR_STRING,"init_cu2_device %d DONE",index); advise(ERROR_STRING); } }
void InitializeGlut(int *argc, char *argv[]) { int i,j; glutInit(argc, argv); glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH); glutInitWindowSize(screenwidth, screenheight); glutCreateWindow(argv[0]); glutDisplayFunc(Display); glutKeyboardFunc(Keyboard); // Support mapped pinned allocations cudaSetDeviceFlags(cudaDeviceMapHost); cudaGLSetGLDevice(0); cublasCreate_v2(&cublasHd); glewInit(); GLint max_texture_size; glGetIntegerv(GL_MAX_TEXTURE_SIZE, &max_texture_size); if(max_texture_size < screenwidth || screenwidth < screenheight) { printf("Max size of texttur(%d) is less than screensize(%d, %d)\n", max_texture_size, screenwidth, screenheight); exit(0); } //Create the textures glActiveTextureARB(GL_TEXTURE0_ARB); // 처리용 텍스쳐 2장 // Q. 왜 2장일까? glGenTextures(2, Processed_Texture); glBindTexture(GL_TEXTURE_RECTANGLE_NV, Processed_Texture[0]); glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, GL_RGBA, GL_FLOAT, NULL); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP); glBindTexture(GL_TEXTURE_RECTANGLE_NV, Processed_Texture[1]); glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, GL_RGBA, GL_FLOAT, NULL); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP); // Site용 텍스쳐 // Q. 처리용과 별개인 이유는? glGenTextures(1, &Site_Texture); glBindTexture(GL_TEXTURE_RECTANGLE_NV, Site_Texture); glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, GL_RGBA, GL_FLOAT, NULL); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP); // Registers the texture or renderbuffer object specified by image for access by CUDA. // A handle to the registered object is returned as resource cutilSafeCall(cudaGraphicsGLRegisterImage(&grSite, Site_Texture, GL_TEXTURE_RECTANGLE_NV, cudaGraphicsMapFlagsReadOnly)); // 에너지용 텍스쳐 // 처리용과 동일한 2장 // Q. 왜?? 
glGenTextures(2, Energy_Texture); glBindTexture(GL_TEXTURE_RECTANGLE_NV, Energy_Texture[0]); glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, GL_RGBA, GL_FLOAT, NULL); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP); glBindTexture(GL_TEXTURE_RECTANGLE_NV, Energy_Texture[1]); glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2, 0, GL_RGBA, GL_FLOAT, NULL); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP); // 인덱스용 텍스쳐 // 인덱스를 컬러로 표현 glGenTextures(1, &IndexColor_Texture); glBindTexture(GL_TEXTURE_RECTANGLE_NV, IndexColor_Texture); glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, screenwidth, screenheight, 0, GL_RGBA, GL_FLOAT, NULL); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP); glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP); // Render Buffer Object glGenFramebuffersEXT(1, &RB_object); glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, RB_object); glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA32F_ARB, screenwidth+2, screenheight+2); // Frame(?) Buffer Object glGenFramebuffersEXT(1, &FB_objects); // ???? // NVIDIA 확인이라는 점만 확인 // http://developer.download.nvidia.com/opengl/specs/nvOpenGLspecs.pdf glGetQueryiv(GL_SAMPLES_PASSED_ARB, GL_QUERY_COUNTER_BITS_ARB, &oq_bitsSupported); glGenQueriesARB(1, &occlusion_query); InitCg(); // 미리 컴파일된 화면 픽셀 목록 ScreenPointsList = glGenLists(1); glNewList(ScreenPointsList, GL_COMPILE); glBegin(GL_POINTS); for (i=0; i<screenwidth; i++) for (j=0; j<screenheight; j++) glVertex2f(i+1.5, j+1.5); glEnd(); glEndList(); }
int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
    uint32_t _ALIGN(64) endiandata[32];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t start_nonce = pdata[19];
    uint32_t throughput = cuda_default_throughput(thr_id, 1U << 17);
    if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);

    uint32_t *outputHash = (uint32_t*)malloc(throughput * 64);

    if (opt_benchmark)
        ptarget[7] = 0x0000ff;

    // init
    if (!init[thr_id])
    {
        cudaSetDevice(device_map[thr_id]);
        if (opt_cudaschedule == -1 && gpu_threads == 1) {
            cudaDeviceReset();
            // reduce cpu usage
            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
            CUDA_LOG_ERROR();
        }
        myriadgroestl_cpu_init(thr_id, throughput);
        init[thr_id] = true;
    }

    for (int k = 0; k < 20; k++)
        be32enc(&endiandata[k], pdata[k]);

    // prepare the context with the endian-swapped block header (the nonce is replaced later)
    myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget);

    do {
        // GPU
        uint32_t foundNounce = UINT32_MAX;

        myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);

        *hashes_done = pdata[19] - start_nonce + throughput;

        if (foundNounce < UINT32_MAX && bench_algo < 0)
        {
            uint32_t _ALIGN(64) vhash[8];
            endiandata[19] = swab32(foundNounce);
            myriadhash(vhash, endiandata);
            if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
                work_set_target_ratio(work, vhash);
                pdata[19] = foundNounce;
                free(outputHash);
                return 1;
            } else {
                gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
            }
        }

        if ((uint64_t) throughput + pdata[19] >= max_nonce) {
            pdata[19] = max_nonce;
            break;
        }

        pdata[19] += throughput;

    } while (!work_restart[thr_id].restart);

    *hashes_done = max_nonce - start_nonce;

    free(outputHash);
    return 0;
}
PetscErrorCode PetscOptionsCheckInitial_Private(void) { char string[64],mname[PETSC_MAX_PATH_LEN],*f; MPI_Comm comm = PETSC_COMM_WORLD; PetscBool flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag; PetscErrorCode ierr; PetscReal si; PetscInt intensity; int i; PetscMPIInt rank; char version[256]; #if !defined(PETSC_HAVE_THREADSAFETY) PetscReal logthreshold; #endif #if defined(PETSC_USE_LOG) PetscViewerFormat format; PetscBool flg4 = PETSC_FALSE; #endif PetscFunctionBegin; ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); #if !defined(PETSC_HAVE_THREADSAFETY) /* Setup the memory management; support for tracing malloc() usage */ ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr); logthreshold = 0.0; ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr); if (flg1) flg3 = PETSC_TRUE; #if defined(PETSC_USE_DEBUG) ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr); if ((!flg2 || flg1) && !petscsetmallocvisited) { if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) { /* turn off default -malloc if valgrind is being used */ ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); } } #else ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr); if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);} #endif if (flg3) { ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr); #if defined(PETSC_USE_DEBUG) if (flg1 && !PETSC_RUNNING_ON_VALGRIND) { ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); ierr = PetscMallocSetDumpLog();CHKERRQ(ierr); ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr); } #endif flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr); if (!flg1) { flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr); } if (flg1) { ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr); } #endif #if defined(PETSC_USE_LOG) ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr); #endif /* Set the display variable for graphics */ ierr = PetscSetDisplay();CHKERRQ(ierr); /* Print the PETSc version information */ ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr); if (flg1 || flg2 || flg3) { /* Print "higher-level" package version message */ if (PetscExternalVersionFunction) { ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr); } ierr = PetscGetVersion(version,256);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\ ------------------------------\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. 
\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\ ------------------------------\n");CHKERRQ(ierr); } /* Print "higher-level" package help message */ if (flg3) { if (PetscExternalHelpFunction) { ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr); } } /* Setup the error handling */ flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr); if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr); if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);} ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr); if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);} /* Setup debugger information */ ierr = PetscSetDefaultDebugger();CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr); if (flg1) { MPI_Errhandler err_handler; ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr); ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr); ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr); if (flg1 || flg2) { PetscMPIInt size; PetscInt lsize,*nodes; MPI_Errhandler err_handler; /* we have to make sure that all processors have opened connections to all other processors, otherwise once the debugger has stated it is likely to receive a SIGUSR1 and kill the program. 
*/ ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); if (size > 2) { PetscMPIInt dummy = 0; MPI_Status status; for (i=0; i<size; i++) { if (rank != i) { ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr); } } for (i=0; i<size; i++) { if (rank != i) { ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr); } } } /* check if this processor node should be in debugger */ ierr = PetscMalloc1(size,&nodes);CHKERRQ(ierr); lsize = size; ierr = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr); if (flag) { for (i=0; i<lsize; i++) { if (nodes[i] == rank) { flag = PETSC_FALSE; break; } } } if (!flag) { ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr); if (flg1) { ierr = PetscAttachDebugger();CHKERRQ(ierr); } else { ierr = PetscStopForDebugger();CHKERRQ(ierr); } ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr); ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr); } ierr = PetscFree(nodes);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr); if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);} /* Setup profiling and logging */ #if defined(PETSC_USE_INFO) { char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0; ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr); if (flg1 && logname[0]) { ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr); } else if (flg1) { ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr); } } #endif #if defined(PETSC_USE_LOG) mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { if (mname[0]) { ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr); } else { ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr); } } #if defined(PETSC_HAVE_MPE) flg1 = PETSC_FALSE; ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr); if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);} #endif flg1 = PETSC_FALSE; flg3 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr); if (flg1) { ierr = PetscLogAllBegin();CHKERRQ(ierr); } else if (flg3) { ierr = PetscLogDefaultBegin();CHKERRQ(ierr);} ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr); if (flg1) { char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN]; FILE *file; if (mname[0]) { sprintf(name,"%s.%d",mname,rank); ierr = PetscFixFilename(name,fname);CHKERRQ(ierr); file = fopen(fname,"w"); if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname); } else file = PETSC_STDOUT; ierr = PetscLogTraceBegin(file);CHKERRQ(ierr); } ierr = PetscOptionsGetViewer(PETSC_COMM_WORLD,NULL,"-log_view",NULL,&format,&flg4);CHKERRQ(ierr); if (flg4) { if (format == PETSC_VIEWER_ASCII_XML){ ierr = PetscLogNestedBegin();CHKERRQ(ierr); } else { ierr = PetscLogDefaultBegin();CHKERRQ(ierr); } } #endif ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr); #if defined(PETSC_HAVE_CUDA) ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr); if (flg1) { struct cudaDeviceProp prop; int devCount; int device; cudaError_t err = cudaSuccess; err = cudaGetDeviceCount(&devCount); if (err != cudaSuccess) 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err)); for (device = 0; device < devCount; ++device) { err = cudaGetDeviceProperties(&prop, device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err)); ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr); } } { int size; ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); if (size>1) { int devCount, device, rank; cudaError_t err = cudaSuccess; /* check to see if we force multiple ranks to hit the same GPU */ ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr); if (flg1) { err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } else { /* we're not using the same GPU on multiple MPI threads. So try to allocated different GPUs to different processes */ /* First get the device count */ err = cudaGetDeviceCount(&devCount); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err)); /* next determine the rank and then set the device via a mod */ ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); device = rank % devCount; err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess multiple devices may try to set the flags on the same device. So long as one of them succeeds, things are ok. */ err = cudaSetDeviceFlags(cudaDeviceMapHost); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err)); } else { int device; cudaError_t err = cudaSuccess; /* the code below works for serial GPU simulations */ ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr); if (flg1) { err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } /* set the device flags so that it can map host memory ... here, we error check. */ err = cudaSetDeviceFlags(cudaDeviceMapHost); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err)); } } #endif /* Print basic help message */ ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr); if (flg1) { ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -help: prints help method for each option\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. 
Useful \n ");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," only when run in the debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," start the debugger in new xterm\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," unless noxterm is given\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," start all processes in the debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," emacs jumps to error file\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] Nodes to start in debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," waits the delay for you to attach\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr); #if defined(PETSC_USE_LOG) ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr); #if defined(PETSC_HAVE_MPE) ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr); #endif ierr = (*PetscHelpPrintf)(comm," 
-info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr); #endif ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr); } #if defined(PETSC_HAVE_POPEN) { char machine[128]; ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr); } } #endif ierr = PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscSleep(si);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr); if (f) { ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr); } } #if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL) ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr); if (!flg3) { ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr); } #endif #if defined(PETSC_HAVE_CUSP) ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr); PetscCUSPSynchronize = flg3; #elif defined(PETSC_HAVE_VIENNACL) ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr); PetscViennaCLSynchronize = flg3; #endif PetscFunctionReturn(0); }
/** * This function does what it says on the tin. */ void OptiXRenderer::performRender(long long int photons, int argc_mpi, char* argv_mpi[], int width, int height, float film_location) { // Keep track of time timeval tic; // Create OptiX context optix::Context context = optix::Context::create(); context->setRayTypeCount( 1 ); // Debug, this will make everything SLOOOOOW context->setPrintEnabled(false); // Set some CUDA flags cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceLmemResizeToMax); // Set used devices int tmp[] = { 0, 1 }; std::vector<int> v( tmp, tmp+2 ); context->setDevices(v.begin(), v.end()); // Report device usage int num_devices = context->getEnabledDeviceCount(); printf("Using %d devices:\n", num_devices); std::vector<int> enabled_devices = context->getEnabledDevices(); for(int i=0;i<num_devices;i++) { printf(" Device #%d [%s]\n", enabled_devices[i], context->getDeviceName(enabled_devices[i]).c_str()); } // Set some OptiX variables context->setStackSize(4096); // Report OptiX infomation int stack_size_in_bytes = context->getStackSize(); printf("Optix stack size is %d bytes (~%d KB).\n.", stack_size_in_bytes, stack_size_in_bytes/1024); // Declare some variables int threads = 500000; //20000000; unsigned int iterations_on_device = 1; // Set some scene-wide variables context["photon_ray_type"]->setUint( 0u ); context["scene_bounce_limit"]->setUint( 10u ); context["scene_epsilon"]->setFloat( 1.e-4f ); context["iterations"]->setUint(iterations_on_device); context["follow_photon"]->setInt(66752); // Convert our existing scene into an OptiX one convertToOptiXScene(context, width, height, film_location); // Report infomation printf("Rendering with:\n"); printf(" %lld photons.\n", photons); printf(" %d threads.\n", threads); printf(" %d iterations per thread.\n", iterations_on_device); int launches = (photons/threads)/iterations_on_device; if(launches*threads*iterations_on_device<photons) { launches++; printf(" NOTE: You have asked for %lld photons, we are providing %lld photons instead.\n", photons, launches*threads*iterations_on_device); } printf(" %d optix launches.\n", launches); // Create buffer for random numbers optix::Buffer random_buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_USER, threads ); random_buffer->setElementSize(sizeof(curandState)); curandState* states_ptr[num_devices]; // Intalise for(int i=0;i<num_devices;i++) { int device_id = enabled_devices[i]; long memory_in_bytes = threads * sizeof(curandState); long memory_in_megabytes = memory_in_bytes/(1024*1024); printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for random states...\n", memory_in_bytes, memory_in_megabytes, device_id); gettimeofday(&tic, NULL); cudaSetDevice(device_id); cudaMalloc((void **)&states_ptr[i], memory_in_bytes); done(tic); CUDAWrapper executer; executer.curand_setup(threads, (void **)&states_ptr[i], time(NULL), i); } // Set as buffer on context context["states"]->set(random_buffer); // Wait printf("Waiting for random states to initalise...\n"); gettimeofday(&tic, NULL); for(int i=0;i<num_devices;i++) { cudaSetDevice(enabled_devices[i]); sync_all_threads(); } done(tic); // Bind to the OptiX buffer // We do this here because it cases a syncronise apparently for(int i=0;i<num_devices;i++) { random_buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) states_ptr[i]); } // Create Image buffer optix::Buffer buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_FLOAT4, width, height ); 
optix::float4* imgs_ptr[num_devices]; //cudaSetDevice(0); //cudaMalloc((void **)&states_ptr_0, threads * sizeof(curandState)); for(int i=0;i<num_devices;i++) { int device_id = enabled_devices[i]; long memory_in_bytes = width * height * sizeof(optix::float4); long memory_in_megabytes = memory_in_bytes/(1024*1024); printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for image result...\n", memory_in_bytes, memory_in_megabytes, device_id); gettimeofday(&tic, NULL); cudaSetDevice(device_id); cudaMalloc((void **)&imgs_ptr[i], memory_in_bytes); done(tic); CUDAWrapper executer; executer.img_setup((void **)&imgs_ptr[i], width, height); } // Set as buffer on context context["output_buffer"]->set(buffer); // Wait for everytyhing to execute printf("Waiting for Image data to initalise...\n"); gettimeofday(&tic, NULL); for(int i=0;i<num_devices;i++) { cudaSetDevice(enabled_devices[i]); sync_all_threads(); } done(tic); // Bind to the OptiX buffer // We do this here because it cases a syncronise apparently for(int i=0;i<num_devices;i++) { buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) imgs_ptr[i]); } // Construct MPI int size, rank = 0; #ifndef PHOTON_MPI (void)size; (void)argc_mpi; (void)argv_mpi; #endif #ifdef PHOTON_MPI MPI::Init( argc_mpi, argv_mpi ); //MPI_Get_processor_name(hostname,&strlen); rank = MPI::COMM_WORLD.Get_rank(); size = MPI::COMM_WORLD.Get_size(); printf("Hello, world; from process %d of %d\n", rank, size); // Adjust number of photons for MPI long long int long_size = (long long int) size; photons = photons/long_size; if(rank==0) printf("MPI adjusted to %lld photons per thread", photons); #endif /* MPI */ // Validate try{ context->validate(); }catch(Exception& e){ printf("Validate error!\n"); printf(" CUDA says : %s\n", cudaGetErrorString(cudaPeekAtLastError())); printf(" OptiX says : %s\n", e.getErrorString().c_str() ); return; } // Compile context try{ context->compile(); }catch(Exception& e){ printf("Compile error!\n"); printf(" CUDA says : %s\n", cudaGetErrorString(cudaPeekAtLastError())); printf(" OptiX says : %s\n", e.getErrorString().c_str() ); return; } // Render int current_launch = 0; try{ printf("Begin render...\n"); gettimeofday(&tic, NULL); for(current_launch=0;current_launch<launches;current_launch++) { printf(" ... %f percent\n", 100*((current_launch*1.0f*threads)/photons)); context->launch(0 , threads ); } done(tic); }catch(Exception& e){ printf("Launch error on launch #%d!\n", current_launch); printf(" CUDA says : %s\n", cudaGetErrorString(cudaPeekAtLastError())); printf(" OptiX says : %s\n", e.getErrorString().c_str() ); return; } #ifndef PHOTON_MPI #endif /* If not MPI */ #ifdef PHOTON_MPI // Create MPI handles accImg = new Image(img->getWidth(), img->getHeight()); MPI::Win window_r; MPI::Win window_g; MPI::Win window_b; // Construct an MPI Window to copy some data into, one for each colour. 
int size_in_bytes = sizeof(float)*img->getWidth()*img->getHeight(); window_r = MPI::Win::Create(accImg->imageR, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD); window_g = MPI::Win::Create(accImg->imageG, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD); window_b = MPI::Win::Create(accImg->imageB, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD); // Perform transfer window_r.Fence(0); window_g.Fence(0); window_b.Fence(0); window_r.Accumulate( img->imageR, img->getWidth()*img->getHeight(), MPI_FLOAT, 0, 0, img->getWidth()*img->getHeight(), MPI_FLOAT, MPI_SUM ); window_g.Accumulate( img->imageG, img->getWidth()*img->getHeight(), MPI_FLOAT, 0, 0, img->getWidth()*img->getHeight(), MPI_FLOAT, MPI_SUM ); window_b.Accumulate( img->imageB, img->getWidth()*img->getHeight(), MPI_FLOAT, 0, 0, img->getWidth()*img->getHeight(), MPI_FLOAT, MPI_SUM ); window_r.Fence(0); window_g.Fence(0); window_b.Fence(0); window_r.Free(); #endif /* MPI */ // Output the image if(rank==0) { // Construct filename char sbuffer[100]; sprintf(sbuffer, "photons-%d.ppm", 0); // This is the collected image data on the host optix::float4* img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4)); // If we have more than one device we have to accumulate everything back into one buffer if(num_devices == 1) { img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4)); cudaMemcpy(img_host_ptr, imgs_ptr[0], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost); } else { printf("We are using %d GPUs, accumulating result...", num_devices); gettimeofday(&tic, NULL); // Create an accumulate buffer on GPU #0# // int device_id = enabled_devices[0]; // cudaSetDevice(device_id); // optix::float4* accumulate_dev_ptr; // cudaMalloc((void **)&accumulate_dev_ptr, width*height*sizeof(optix::float4)); // // put the array of memory ptrs on device 0 // optix::float4** ptrs_dev_ptr; // cudaMalloc((void **)&ptrs_dev_ptr, num_devices*sizeof(optix::float4*)); // // Copy data over // cudaMemcpy(ptrs_dev_ptr, imgs_ptr, num_devices*sizeof(optix::float4*), cudaMemcpyHostToDevice); // CUDAWrapper executer; // executer.img_accumulate((void ***)&ptrs_dev_ptr, (void **)&accumulate_dev_ptr, num_devices, width, height); // cudaMemcpy(img_host_ptr, accumulate_dev_ptr, width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost); // Copy everything to host and accumulate here optix::float4* host_buffers[num_devices]; for(int i=0;i<num_devices;i++) { host_buffers[i] = (optix::float4*) malloc(width*height*sizeof(optix::float4)); cudaMemcpy(host_buffers[i], imgs_ptr[i], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost); } // Acumulate for(int i=0;i<width*height;i++) { img_host_ptr[i] = make_float4(0, 0, 0, 0); for(int j=0;j<num_devices;j++) { img_host_ptr[i].x += host_buffers[j][i].x; img_host_ptr[i].y += host_buffers[j][i].y; img_host_ptr[i].z += host_buffers[j][i].z; img_host_ptr[i].w += host_buffers[j][i].w; } } for(int i=0;i<num_devices;i++) { free(host_buffers[i]); } done(tic); } printf("Saving Image to %s...\n", sbuffer); gettimeofday(&tic, NULL); saveToPPMFile(sbuffer, img_host_ptr, width, height); free(img_host_ptr); done(tic); } #ifdef PHOTON_MPI // Teardown MPI MPI::Finalize(); #endif /* MPI */ }
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { int dev_id = device_map[thr_id]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const int swap = 1; // to toggle nonce endian uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8 if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0; if (!init[thr_id]) { cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); CUDA_LOG_ERROR(); } cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); quark_skein512_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput); CUDA_SAFE_CALL(cudaDeviceSynchronize()); init[thr_id] = true; } uint32_t endiandata[20]; for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); skein512_cpu_setBlock_80((void*)endiandata); cuda_check_cpu_setTarget(ptarget); do { int order = 0; // Hash with CUDA skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { uint32_t _ALIGN(64) vhash64[8]; endiandata[19] = swab32_if(foundNonce, swap); skein2hash(vhash64, endiandata); if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); work_set_target_ratio(work, vhash64); if (secNonce != 0) { if (!opt_quiet) applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce)); endiandata[19] = swab32_if(secNonce, swap); skein2hash(vhash64, endiandata); if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) work_set_target_ratio(work, vhash64); pdata[21] = swab32_if(secNonce, !swap); res++; } pdata[19] = swab32_if(foundNonce, !swap); return res; } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); } } if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } pdata[19] += throughput; } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce; return 0; }
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done) { static THREAD uint32_t *foundNounce = nullptr; uint32_t start_nonce = pdata[19]; unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23; uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity); uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) ptarget[7] = 0x0000000f; // init static THREAD volatile bool init = false; if(!init) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); groestlcoin_cpu_init(thr_id, throughputmax); CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4)); init = true; } // Endian Drehung ist notwendig uint32_t endiandata[32]; for (int kk=0; kk < 32; kk++) be32enc(&endiandata[kk], pdata[kk]); // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) groestlcoin_cpu_setBlock(thr_id, endiandata); do { // GPU const uint32_t Htarg = ptarget[7]; groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]); if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} if(foundNounce[0] < 0xffffffff) { uint32_t tmpHash[8]; endiandata[19] = SWAP32(foundNounce[0]); groestlhash(tmpHash, endiandata); if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { int res = 1; if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]); *hashes_done = pdata[19] - start_nonce + throughput; if(foundNounce[1] != 0xffffffff) { endiandata[19] = SWAP32(foundNounce[1]); groestlhash(tmpHash, endiandata); if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { pdata[21] = foundNounce[1]; res++; if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]); } else { if(tmpHash[7] != Htarg) { applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]); } } } pdata[19] = foundNounce[0]; return res; } else { if(tmpHash[7] != Htarg) { applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]); } } } pdata[19] += throughput; cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); exit(EXIT_FAILURE); } } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); *hashes_done = pdata[19] - start_nonce; return 0; }
/**
 * @ingroup virginian
 * @brief Initializes the Virginian database
 *
 * Initializes or re-initializes the struct that holds the
 * state of the database. It sets a number of options via defaults hard-coded or
 * defined in virginian.h. Additionally, this function is responsible for
 * allocating the tablet memory areas in both main memory and GPU memory. If the
 * VIRG_DEBUG macro is defined, both the state struct and the tablet memory
 * areas are set to 0xDEADBEEF. Finally, this function also initializes the CUDA
 * context. The allocations made in this function are freed with virg_close().
 *
 * @param v Pointer to the state struct of the database system
 * @return VIRG_SUCCESS or VIRG_FAIL depending on errors during the function
 * call
 */
int virg_init(virginian *v)
{
    int i;
    cudaError_t r;

#ifdef VIRG_DEBUG
    // fill the db struct with a recognizable pattern for valgrind
    memset(&v->db, 0xDEADBEEF, sizeof(virg_db));
#endif

    // set struct defaults
    v->tablet_slot_counter = 0;
    v->tablet_slots_taken = 0;
    v->threads_per_block = VIRG_THREADSPERBLOCK;
    v->multi_threads = VIRG_MULTITHREADS;
    v->use_multi = 0;
    v->use_gpu = 0;
    v->use_stream = 0;
    v->use_mmap = 0;
    v->dbfd = -1;

    // init mutex for locking tablet slots
    VIRG_CHECK(pthread_mutex_init(&v->slot_lock, NULL), "Could not init mutex")

    // set proper flags; the first enables mapped memory. The flags are combined in
    // a single call because each cudaSetDeviceFlags() call records a new flag set,
    // replacing the previous one, and the schedule flags are mutually exclusive
    // (blocking sync is kept, spin dropped).
    //cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceScheduleSpin | cudaDeviceScheduleBlockingSync);
    cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceScheduleBlockingSync);
    VIRG_CUDCHK("set device flags");

    // initialize cuda context
    cudaSetDevice(VIRG_CUDADEVICE);
    VIRG_CUDCHK("set device");

    // initialize tablets
    for (i = 0; i < VIRG_MEM_TABLETS; i++) {
        v->tablet_slot_status[i] = 0;

        // if pinned, then use cuda alloc
#ifndef VIRG_NOPINNED
        r = cudaHostAlloc((void**)&v->tablet_slots[i], VIRG_TABLET_SIZE, cudaHostAllocMapped);
        VIRG_CUDCHK("Allocating pinned tablet memory");
#else
        v->tablet_slots[i] = malloc(VIRG_TABLET_SIZE);
        VIRG_CHECK(v->tablet_slots[i] == NULL, "Problem allocating tablet memory");
#endif

#ifdef VIRG_DEBUG
        // this should only be read when status is nonzero
        v->tablet_slot_ids[i] = 0;
        // fill memory with a recognizable pattern for debugging
        memset(v->tablet_slots[i], 0xDEADBEEF, VIRG_TABLET_SIZE);
#endif
    }

    if (VIRG_GPU_TABLETS > 0) {
        // initialize gpu tablets as a single block
        r = cudaMalloc((void**)&v->gpu_slots, VIRG_TABLET_SIZE * VIRG_GPU_TABLETS);
        VIRG_CHECK(r != cudaSuccess, "Problem allocating GPU tablet memory");

#ifdef VIRG_DEBUG
        cudaMemset(v->gpu_slots, 0xDEADBEEF, VIRG_TABLET_SIZE * VIRG_GPU_TABLETS);
#endif
    }

    return VIRG_SUCCESS;
}
cudaError_t WINAPI wine_cudaSetDeviceFlags( int flags ) {
    WINE_TRACE("\n");
    return cudaSetDeviceFlags( flags );
}
// This is the GPU thread where we do the per-GPU tasks. void GRTRegenerateChains::GPU_Thread(void *pointer) { struct GRTRegenerateThreadRunData *data; GRTWorkunitElement *WU; data = (GRTRegenerateThreadRunData *) pointer; // Set the device. cudaSetDevice(data->gpuDeviceId); // Enable blocking sync. This dramatically reduces CPU usage. // If zero copy is being used, set DeviceMapHost as well if (this->CommandLineData->GetUseZeroCopy()) { cudaSetDeviceFlags(cudaDeviceBlockingSync | cudaDeviceMapHost); } else { cudaSetDeviceFlags(cudaDeviceBlockingSync); } //printf("Copying to GPU mem\n"); this->AllocatePerGPUMemory(data); this->copyDataToConstant(data); // printf("Back from copy constant\n"); cudaThreadSynchronize(); // I... *think* we're ready to rock! // As long as we aren't supposed to exit, keep running. while (1) { WU = this->Workunit->GetNextWorkunit(); if (WU == NULL) { if (!silent) { if (this->Display) { sprintf(this->statusStrings, "Thread %d out of WU", data->threadID); this->Display->addStatusLine(this->statusStrings); } else { printf("Thread %d out of workunits\n", data->threadID); } } break; } if (this->HashFile->GetUncrackedHashCount() == 0) { if (!silent) { if (this->Display) { sprintf(this->statusStrings, "All hashes found"); this->Display->addStatusLine(this->statusStrings); } else { printf("Thread %d no unfound hashes left!\n", data->threadID); } } break; } this->RunGPUWorkunit(WU, data); this->Workunit->SubmitWorkunit(WU); if (this->Display) { this->Display->setWorkunitsCompleted(this->Workunit->GetNumberOfCompletedWorkunits()); this->Display->setThreadFractionDone(data->threadID, 0.0); } } if (this->Display) { this->Display->setThreadCrackSpeed(data->threadID, 0, 0.0); } this->FreePerGPUMemory(data); // Clean up thread context for subsequent setting of thread ID cudaThreadExit(); }