pbf_dam_sim::pbf_dam_sim(scalar_t space) { init_cond = make_shared<pbf_dam_init_cond>(space); init_cond->getParameter(simulatee.parameter); init_cond->getExternalForce(simulatee.external); auto domain = init_cond->getDomainRange(); const auto num_capacity = 40000; simulatee.allocate(num_capacity, domain.second); buffer.allocate(num_capacity, simulatee.ns->getMaxPairParticleNum()); vector<glm::vec3> x; vector<glm::vec3> v; init_cond->getDomainParticlePhaseHost(x, v); simulatee.phase.num = x.size(); cout << "initial particle number: " << x.size() << endl; #pragma region gl_interpo_init swVBOs::getInstance().enroll("pbf_particle"); glBindBuffer(GL_ARRAY_BUFFER, swVBOs::getInstance().findVBO("pbf_particle")); glBufferData(GL_ARRAY_BUFFER, x.size() * sizeof(glm::vec3), x.data(), GL_STATIC_DRAW); gpuErrchk(cudaDeviceSynchronize()); cudaGraphicsGLRegisterBuffer(&cu_res, swVBOs::getInstance().findVBO("pbf_particle"), cudaGraphicsMapFlagsNone); #ifndef NDEBUG gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); #endif // modify particles data cudaFree(simulatee.phase.x); cudaMemcpy(simulatee.phase.v, v.data(), v.size() * sizeof(dom_dim), cudaMemcpyHostToDevice); #pragma endregion }
void oskar_device_check_error(int* status) { if (*status) return; #ifdef OSKAR_HAVE_CUDA *status = (int) cudaPeekAtLastError(); #endif }
void pbf_dam_sim::simulateOneStep() { //simulatee.external.body_force cudaGraphicsMapResources(1, &cu_res); #ifndef NDEBUG gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); #endif size_t pos_size; cudaGraphicsResourceGetMappedPointer((void**)&simulatee.phase.x, &pos_size, cu_res); #ifndef NDEBUG gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); #endif one_step(simulatee, buffer, domain, 3); cudaGraphicsUnmapResources(1, &cu_res); simulatee.phase.x = NULL; }
int main() { // Initialize variables double *h_u; h_u = (double*)malloc(sizeof(double)*(NX*NY*NZ)); // Set Domain Initial Condition and BCs Call_IC(h_u); // GPU Memory Arrays double *d_u; checkCuda(cudaMalloc((void**)&d_u, sizeof(double)*(NX*NY))); double *d_un; checkCuda(cudaMalloc((void**)&d_un,sizeof(double)*(NX*NY))); // Copy Initial Condition from host to device checkCuda(cudaMemcpy(d_u, h_u,sizeof(double)*(NX*NY),cudaMemcpyHostToDevice)); checkCuda(cudaMemcpy(d_un,h_u,sizeof(double)*(NX*NY),cudaMemcpyHostToDevice)); // GPU kernel launch parameters dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z); dim3 dimGrid (DIVIDE_INTO(NX, BLOCK_SIZE_X), DIVIDE_INTO(NY, BLOCK_SIZE_Y), DIVIDE_INTO(NZ, BLOCK_SIZE_Z)); // Request computer current time time_t t = clock(); // Solver Loop for (int step=0; step < NO_STEPS; step+=2) { if (step%10000==0) printf("Step %d of %d\n",step,(int)NO_STEPS); // Compute Laplace Call_Laplace(dimGrid,dimBlock,d_u,d_un); // Call_Laplace_Texture(dimGrid,dimBlock,d_u,d_un); } if (DEBUG) printf("CUDA error (Jacobi_Method) %s\n",cudaGetErrorString(cudaPeekAtLastError())); // Measure and Report computation time t = clock()-t; printf("Computing time (%f seconds).\n",((float)t)/CLOCKS_PER_SEC); // Copy data from device to host checkCuda(cudaMemcpy(h_u,d_u,sizeof(double)*(NX*NY*NZ),cudaMemcpyDeviceToHost)); // uncomment to print solution to terminal if (DEBUG) Print2D(h_u); // Write solution to file Save_Results(h_u); // Free device memory checkCuda(cudaFree(d_u)); checkCuda(cudaFree(d_un)); // Reset device checkCuda(cudaDeviceReset()); // Free memory on host and device free(h_u); return 0; }
void cuda_random(float *x_gpu, size_t n) { static curandGenerator_t gen; static int init = 0; if(!init){ curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(gen, time(0)); init = 1; } curandGenerateUniform(gen, x_gpu, n); check_error(cudaPeekAtLastError()); }
void cuda_random(float *x_gpu, size_t n) { static curandGenerator_t gen[16]; static int init[16] = { 0 }; int i = cuda_get_device(); if (!init[i]) { curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(gen[i], time(0)); init[i] = 1; } curandGenerateUniform(gen[i], x_gpu, n); check_error(cudaPeekAtLastError()); }
void cudaCheck(const int line, const std::string& function, const std::string& file) { try { #ifdef USE_CUDA const auto errorCode = cudaPeekAtLastError(); if(errorCode != cudaSuccess) error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "): " + cudaGetErrorString(errorCode), line, function, file); #else UNUSED(line); UNUSED(function); UNUSED(file); error("OpenPose must be compiled with the `USE_CUDA` macro definition in order to use this" " functionality.", __LINE__, __FUNCTION__, __FILE__); #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
extern inline void _cudaCheckError(const char *file, const int line) { #ifndef NDEBUG cudaDeviceSynchronize(); #endif cudaError_t ret = cudaPeekAtLastError(); if (ret != cudaSuccess) { fprintf(stderr, "cudaCheckError() failed at %s:%i: %s\n", file, line, cudaGetErrorString(cudaGetLastError())); #ifndef NDEBUG switch (ret) { case cudaErrorMissingConfiguration: fprintf(stderr, "Error type: cudaErrorMissingConfiguration\n"); fprintf(stderr, "Missing configuration error.\n"); break; case cudaErrorMemoryAllocation: fprintf(stderr, "Error type: cudaErrorMemoryAllocation\n"); fprintf(stderr, "Memory allocation error.\n"); break; case cudaErrorInitializationError: fprintf(stderr, "Error type: cudaErrorInitializationError\n"); fprintf(stderr, "Initialization error.\n"); break; case cudaErrorLaunchFailure: fprintf(stderr, "Error type: cudaErrorLaunchFailure\n"); fprintf(stderr, "Launch failure.\n"); break; case cudaErrorPriorLaunchFailure: fprintf(stderr, "Error type: cudaErrorPriorLaunchFailure\n"); fprintf(stderr, "Prior launch failure.\n"); break; case cudaErrorLaunchTimeout: fprintf(stderr, "Error type: cudaErrorLaunchTimeout\n"); fprintf(stderr, "Launch timeout error.\n"); break; case cudaErrorLaunchOutOfResources: fprintf(stderr, "Error type: cudaErrorLaunchOutOfResources\n"); fprintf(stderr, "Launch out of resources error.\n"); break; case cudaErrorInvalidDeviceFunction: fprintf(stderr, "Error type: cudaErrorInvalidDeviceFunction\n"); fprintf(stderr, "Invalid device function.\n"); break; case cudaErrorInvalidConfiguration: fprintf(stderr, "Error type: cudaErrorInvalidConfiguration\n"); fprintf(stderr, "Invalid configuration.\n"); break; case cudaErrorInvalidDevice: fprintf(stderr, "Error type: cudaErrorInvalidDevice\n"); fprintf(stderr, "Invalid device.\n"); break; case cudaErrorInvalidValue: fprintf(stderr, "Error type: cudaErrorInvalidValue\n"); fprintf(stderr, "Invalid value.\n"); break; case cudaErrorInvalidPitchValue: fprintf(stderr, "Error type: cudaErrorInvalidPitchValue\n"); fprintf(stderr, "Invalid pitch value.\n"); break; case cudaErrorInvalidSymbol: fprintf(stderr, "Error type: cudaErrorInvalidSymbol\n"); fprintf(stderr, "Invalid symbol.\n"); break; case cudaErrorMapBufferObjectFailed: fprintf(stderr, "Error type: cudaErrorMapBufferObjectFailed\n"); fprintf(stderr, "Map buffer object failed.\n"); break; case cudaErrorUnmapBufferObjectFailed: fprintf(stderr, "Error type: cudaErrorUnmapBufferObjectFailed\n"); fprintf(stderr, "Unmap buffer object failed.\n"); break; case cudaErrorInvalidHostPointer: fprintf(stderr, "Error type: cudaErrorInvalidHostPointer\n"); fprintf(stderr, "Invalid host pointer.\n"); break; case cudaErrorInvalidDevicePointer: fprintf(stderr, "Error type: cudaErrorInvalidDevicePointer\n"); fprintf(stderr, "Invalid device pointer.\n"); break; case cudaErrorInvalidTexture: fprintf(stderr, "Error type: cudaErrorInvalidTexture\n"); fprintf(stderr, "Invalid device pointer.\n"); break; case cudaErrorInvalidTextureBinding: fprintf(stderr, "Error type: cudaErrorInvalidTextureBinding\n"); fprintf(stderr, "Invalid texture binding.\n"); break; case cudaErrorInvalidChannelDescriptor: fprintf(stderr, "Error type: cudaErrorInvalidChannelDescriptor\n"); fprintf(stderr, "Invalid channel descriptor.\n"); break; case cudaErrorInvalidMemcpyDirection: fprintf(stderr, "Error type: cudaErrorInvalidMemcpyDirection\n"); fprintf(stderr, "Invalid memcpy direction.\n"); break; case cudaErrorAddressOfConstant: fprintf(stderr, "Error type: cudaErrorAddressOfConstant\n"); fprintf(stderr, "Address of constant error.\n"); break; case cudaErrorTextureFetchFailed: fprintf(stderr, "Error type: cudaErrorTextureFetchFailed\n"); fprintf(stderr, "Texture fetch failed.\n"); break; case cudaErrorTextureNotBound: fprintf(stderr, "Error type: cudaErrorTextureNotBound\n"); fprintf(stderr, "Texture not bound error.\n"); break; case cudaErrorSynchronizationError: fprintf(stderr, "Error type: cudaErrorSynchronizationError\n"); fprintf(stderr, "Synchronization error.\n"); break; case cudaErrorInvalidFilterSetting: fprintf(stderr, "Error type: cudaErrorInvalidFilterSetting\n"); fprintf(stderr, "Invalid filter setting.\n"); break; case cudaErrorInvalidNormSetting: fprintf(stderr, "Error type: cudaErrorInvalidNormSetting\n"); fprintf(stderr, "Invalid norm setting.\n"); break; case cudaErrorCudartUnloading: fprintf(stderr, "Error type: cudaErrorCudartUnloading\n"); fprintf(stderr, "CUDA runtime unloading.\n"); break; case cudaErrorUnknown: fprintf(stderr, "Error type: cudaErrorUnknown\n"); fprintf(stderr, "Unknown error condition.\n"); break; case cudaErrorNotYetImplemented: fprintf(stderr, "Error type: cudaErrorNotYetImplemented\n"); fprintf(stderr, "Function not yet implemented.\n"); break; case cudaErrorMemoryValueTooLarge: fprintf(stderr, "Error type: cudaErrorMemoryValueTooLarge\n"); fprintf(stderr, "Memory value too large.\n"); break; case cudaErrorInvalidResourceHandle: fprintf(stderr, "Error type: cudaErrorInvalidResourceHandle\n"); fprintf(stderr, "Invalid resource handle.\n"); break; case cudaErrorNotReady: fprintf(stderr, "Error type: cudaErrorNotReady\n"); fprintf(stderr, "Not ready error.\n"); break; case cudaErrorInsufficientDriver: fprintf(stderr, "Error type: cudaErrorInsufficientDriver\n"); fprintf(stderr, "CUDA runtime is newer than driver.\n"); break; case cudaErrorSetOnActiveProcess: fprintf(stderr, "Error type: cudaErrorSetOnActiveProcess\n"); fprintf(stderr, "Set on active process error.\n"); break; case cudaErrorNoDevice: fprintf(stderr, "Error type: cudaErrorNoDevice\n"); fprintf(stderr, "No available CUDA device.\n"); break; } #endif cudaDeviceReset(); exit(EXIT_FAILURE); } }
void checkCudaError(int line) { checkCudaError(line, cudaPeekAtLastError()); }
/** * This function does what it says on the tin. */ void OptiXRenderer::performRender(long long int photons, int argc_mpi, char* argv_mpi[], int width, int height, float film_location) { // Keep track of time timeval tic; // Create OptiX context optix::Context context = optix::Context::create(); context->setRayTypeCount( 1 ); // Debug, this will make everything SLOOOOOW context->setPrintEnabled(false); // Set some CUDA flags cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceLmemResizeToMax); // Set used devices int tmp[] = { 0, 1 }; std::vector<int> v( tmp, tmp+2 ); context->setDevices(v.begin(), v.end()); // Report device usage int num_devices = context->getEnabledDeviceCount(); printf("Using %d devices:\n", num_devices); std::vector<int> enabled_devices = context->getEnabledDevices(); for(int i=0;i<num_devices;i++) { printf(" Device #%d [%s]\n", enabled_devices[i], context->getDeviceName(enabled_devices[i]).c_str()); } // Set some OptiX variables context->setStackSize(4096); // Report OptiX infomation int stack_size_in_bytes = context->getStackSize(); printf("Optix stack size is %d bytes (~%d KB).\n.", stack_size_in_bytes, stack_size_in_bytes/1024); // Declare some variables int threads = 500000; //20000000; unsigned int iterations_on_device = 1; // Set some scene-wide variables context["photon_ray_type"]->setUint( 0u ); context["scene_bounce_limit"]->setUint( 10u ); context["scene_epsilon"]->setFloat( 1.e-4f ); context["iterations"]->setUint(iterations_on_device); context["follow_photon"]->setInt(66752); // Convert our existing scene into an OptiX one convertToOptiXScene(context, width, height, film_location); // Report infomation printf("Rendering with:\n"); printf(" %lld photons.\n", photons); printf(" %d threads.\n", threads); printf(" %d iterations per thread.\n", iterations_on_device); int launches = (photons/threads)/iterations_on_device; if(launches*threads*iterations_on_device<photons) { launches++; printf(" NOTE: You have asked for %lld photons, we are providing %lld photons instead.\n", photons, launches*threads*iterations_on_device); } printf(" %d optix launches.\n", launches); // Create buffer for random numbers optix::Buffer random_buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_USER, threads ); random_buffer->setElementSize(sizeof(curandState)); curandState* states_ptr[num_devices]; // Intalise for(int i=0;i<num_devices;i++) { int device_id = enabled_devices[i]; long memory_in_bytes = threads * sizeof(curandState); long memory_in_megabytes = memory_in_bytes/(1024*1024); printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for random states...\n", memory_in_bytes, memory_in_megabytes, device_id); gettimeofday(&tic, NULL); cudaSetDevice(device_id); cudaMalloc((void **)&states_ptr[i], memory_in_bytes); done(tic); CUDAWrapper executer; executer.curand_setup(threads, (void **)&states_ptr[i], time(NULL), i); } // Set as buffer on context context["states"]->set(random_buffer); // Wait printf("Waiting for random states to initalise...\n"); gettimeofday(&tic, NULL); for(int i=0;i<num_devices;i++) { cudaSetDevice(enabled_devices[i]); sync_all_threads(); } done(tic); // Bind to the OptiX buffer // We do this here because it cases a syncronise apparently for(int i=0;i<num_devices;i++) { random_buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) states_ptr[i]); } // Create Image buffer optix::Buffer buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_FLOAT4, width, height ); optix::float4* imgs_ptr[num_devices]; //cudaSetDevice(0); //cudaMalloc((void **)&states_ptr_0, threads * sizeof(curandState)); for(int i=0;i<num_devices;i++) { int device_id = enabled_devices[i]; long memory_in_bytes = width * height * sizeof(optix::float4); long memory_in_megabytes = memory_in_bytes/(1024*1024); printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for image result...\n", memory_in_bytes, memory_in_megabytes, device_id); gettimeofday(&tic, NULL); cudaSetDevice(device_id); cudaMalloc((void **)&imgs_ptr[i], memory_in_bytes); done(tic); CUDAWrapper executer; executer.img_setup((void **)&imgs_ptr[i], width, height); } // Set as buffer on context context["output_buffer"]->set(buffer); // Wait for everytyhing to execute printf("Waiting for Image data to initalise...\n"); gettimeofday(&tic, NULL); for(int i=0;i<num_devices;i++) { cudaSetDevice(enabled_devices[i]); sync_all_threads(); } done(tic); // Bind to the OptiX buffer // We do this here because it cases a syncronise apparently for(int i=0;i<num_devices;i++) { buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) imgs_ptr[i]); } // Construct MPI int size, rank = 0; #ifndef PHOTON_MPI (void)size; (void)argc_mpi; (void)argv_mpi; #endif #ifdef PHOTON_MPI MPI::Init( argc_mpi, argv_mpi ); //MPI_Get_processor_name(hostname,&strlen); rank = MPI::COMM_WORLD.Get_rank(); size = MPI::COMM_WORLD.Get_size(); printf("Hello, world; from process %d of %d\n", rank, size); // Adjust number of photons for MPI long long int long_size = (long long int) size; photons = photons/long_size; if(rank==0) printf("MPI adjusted to %lld photons per thread", photons); #endif /* MPI */ // Validate try{ context->validate(); }catch(Exception& e){ printf("Validate error!\n"); printf(" CUDA says : %s\n", cudaGetErrorString(cudaPeekAtLastError())); printf(" OptiX says : %s\n", e.getErrorString().c_str() ); return; } // Compile context try{ context->compile(); }catch(Exception& e){ printf("Compile error!\n"); printf(" CUDA says : %s\n", cudaGetErrorString(cudaPeekAtLastError())); printf(" OptiX says : %s\n", e.getErrorString().c_str() ); return; } // Render int current_launch = 0; try{ printf("Begin render...\n"); gettimeofday(&tic, NULL); for(current_launch=0;current_launch<launches;current_launch++) { printf(" ... %f percent\n", 100*((current_launch*1.0f*threads)/photons)); context->launch(0 , threads ); } done(tic); }catch(Exception& e){ printf("Launch error on launch #%d!\n", current_launch); printf(" CUDA says : %s\n", cudaGetErrorString(cudaPeekAtLastError())); printf(" OptiX says : %s\n", e.getErrorString().c_str() ); return; } #ifndef PHOTON_MPI #endif /* If not MPI */ #ifdef PHOTON_MPI // Create MPI handles accImg = new Image(img->getWidth(), img->getHeight()); MPI::Win window_r; MPI::Win window_g; MPI::Win window_b; // Construct an MPI Window to copy some data into, one for each colour. int size_in_bytes = sizeof(float)*img->getWidth()*img->getHeight(); window_r = MPI::Win::Create(accImg->imageR, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD); window_g = MPI::Win::Create(accImg->imageG, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD); window_b = MPI::Win::Create(accImg->imageB, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD); // Perform transfer window_r.Fence(0); window_g.Fence(0); window_b.Fence(0); window_r.Accumulate( img->imageR, img->getWidth()*img->getHeight(), MPI_FLOAT, 0, 0, img->getWidth()*img->getHeight(), MPI_FLOAT, MPI_SUM ); window_g.Accumulate( img->imageG, img->getWidth()*img->getHeight(), MPI_FLOAT, 0, 0, img->getWidth()*img->getHeight(), MPI_FLOAT, MPI_SUM ); window_b.Accumulate( img->imageB, img->getWidth()*img->getHeight(), MPI_FLOAT, 0, 0, img->getWidth()*img->getHeight(), MPI_FLOAT, MPI_SUM ); window_r.Fence(0); window_g.Fence(0); window_b.Fence(0); window_r.Free(); #endif /* MPI */ // Output the image if(rank==0) { // Construct filename char sbuffer[100]; sprintf(sbuffer, "photons-%d.ppm", 0); // This is the collected image data on the host optix::float4* img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4)); // If we have more than one device we have to accumulate everything back into one buffer if(num_devices == 1) { img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4)); cudaMemcpy(img_host_ptr, imgs_ptr[0], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost); } else { printf("We are using %d GPUs, accumulating result...", num_devices); gettimeofday(&tic, NULL); // Create an accumulate buffer on GPU #0# // int device_id = enabled_devices[0]; // cudaSetDevice(device_id); // optix::float4* accumulate_dev_ptr; // cudaMalloc((void **)&accumulate_dev_ptr, width*height*sizeof(optix::float4)); // // put the array of memory ptrs on device 0 // optix::float4** ptrs_dev_ptr; // cudaMalloc((void **)&ptrs_dev_ptr, num_devices*sizeof(optix::float4*)); // // Copy data over // cudaMemcpy(ptrs_dev_ptr, imgs_ptr, num_devices*sizeof(optix::float4*), cudaMemcpyHostToDevice); // CUDAWrapper executer; // executer.img_accumulate((void ***)&ptrs_dev_ptr, (void **)&accumulate_dev_ptr, num_devices, width, height); // cudaMemcpy(img_host_ptr, accumulate_dev_ptr, width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost); // Copy everything to host and accumulate here optix::float4* host_buffers[num_devices]; for(int i=0;i<num_devices;i++) { host_buffers[i] = (optix::float4*) malloc(width*height*sizeof(optix::float4)); cudaMemcpy(host_buffers[i], imgs_ptr[i], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost); } // Acumulate for(int i=0;i<width*height;i++) { img_host_ptr[i] = make_float4(0, 0, 0, 0); for(int j=0;j<num_devices;j++) { img_host_ptr[i].x += host_buffers[j][i].x; img_host_ptr[i].y += host_buffers[j][i].y; img_host_ptr[i].z += host_buffers[j][i].z; img_host_ptr[i].w += host_buffers[j][i].w; } } for(int i=0;i<num_devices;i++) { free(host_buffers[i]); } done(tic); } printf("Saving Image to %s...\n", sbuffer); gettimeofday(&tic, NULL); saveToPPMFile(sbuffer, img_host_ptr, width, height); free(img_host_ptr); done(tic); } #ifdef PHOTON_MPI // Teardown MPI MPI::Finalize(); #endif /* MPI */ }
/*===========================================================================*/ cudaError_t PeekAtLastError() { return cudaPeekAtLastError(); }
void _cudaCheck(const char* file, int line) { _cudaAssert(cudaPeekAtLastError(), file, line); }
void cuda_check_last_error() { check_cuda_error( cudaPeekAtLastError()); }