Пример #1
0
pbf_dam_sim::pbf_dam_sim(scalar_t space)
{
	init_cond = make_shared<pbf_dam_init_cond>(space);
	init_cond->getParameter(simulatee.parameter);
	init_cond->getExternalForce(simulatee.external);
	auto domain = init_cond->getDomainRange();
	const auto num_capacity = 40000;
	simulatee.allocate(num_capacity, domain.second);
	buffer.allocate(num_capacity, simulatee.ns->getMaxPairParticleNum());
	vector<glm::vec3> x;
	vector<glm::vec3> v;
	init_cond->getDomainParticlePhaseHost(x, v);
	simulatee.phase.num = x.size();
	cout << "initial particle number: " << x.size() << endl;

#pragma region gl_interpo_init

	swVBOs::getInstance().enroll("pbf_particle");
	glBindBuffer(GL_ARRAY_BUFFER, swVBOs::getInstance().findVBO("pbf_particle"));
	glBufferData(GL_ARRAY_BUFFER, x.size() * sizeof(glm::vec3), x.data(), GL_STATIC_DRAW);
	gpuErrchk(cudaDeviceSynchronize());
	cudaGraphicsGLRegisterBuffer(&cu_res, swVBOs::getInstance().findVBO("pbf_particle"), cudaGraphicsMapFlagsNone);
#ifndef NDEBUG
	gpuErrchk(cudaPeekAtLastError());
	gpuErrchk(cudaDeviceSynchronize());
#endif

	// modify particles data
	cudaFree(simulatee.phase.x);
	cudaMemcpy(simulatee.phase.v, v.data(), v.size() * sizeof(dom_dim), cudaMemcpyHostToDevice);
#pragma endregion
}
Пример #2
0
void oskar_device_check_error(int* status)
{
    if (*status) return;

#ifdef OSKAR_HAVE_CUDA
    *status = (int) cudaPeekAtLastError();
#endif
}
Пример #3
0
void pbf_dam_sim::simulateOneStep()
{
	//simulatee.external.body_force

	cudaGraphicsMapResources(1, &cu_res);
#ifndef NDEBUG
	gpuErrchk(cudaPeekAtLastError());
	gpuErrchk(cudaDeviceSynchronize());
#endif
	size_t pos_size;
	cudaGraphicsResourceGetMappedPointer((void**)&simulatee.phase.x, &pos_size, cu_res);
#ifndef NDEBUG
	gpuErrchk(cudaPeekAtLastError());
	gpuErrchk(cudaDeviceSynchronize());
#endif
	one_step(simulatee, buffer, domain, 3);
	cudaGraphicsUnmapResources(1, &cu_res);
	simulatee.phase.x = NULL;
}
Пример #4
0
int main() {
  // Initialize variables
  double *h_u; h_u = (double*)malloc(sizeof(double)*(NX*NY*NZ));

  // Set Domain Initial Condition and BCs
  Call_IC(h_u);

  // GPU Memory Arrays
  double *d_u;  checkCuda(cudaMalloc((void**)&d_u, sizeof(double)*(NX*NY)));
  double *d_un; checkCuda(cudaMalloc((void**)&d_un,sizeof(double)*(NX*NY)));

  // Copy Initial Condition from host to device
  checkCuda(cudaMemcpy(d_u, h_u,sizeof(double)*(NX*NY),cudaMemcpyHostToDevice));
  checkCuda(cudaMemcpy(d_un,h_u,sizeof(double)*(NX*NY),cudaMemcpyHostToDevice));

  // GPU kernel launch parameters
  dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z);
  dim3 dimGrid (DIVIDE_INTO(NX, BLOCK_SIZE_X), DIVIDE_INTO(NY, BLOCK_SIZE_Y), DIVIDE_INTO(NZ, BLOCK_SIZE_Z)); 

  // Request computer current time
  time_t t = clock();

  // Solver Loop 
  for (int step=0; step < NO_STEPS; step+=2) {
    if (step%10000==0) printf("Step %d of %d\n",step,(int)NO_STEPS);
      // Compute Laplace
      Call_Laplace(dimGrid,dimBlock,d_u,d_un);
      // Call_Laplace_Texture(dimGrid,dimBlock,d_u,d_un);
    }
  if (DEBUG) printf("CUDA error (Jacobi_Method) %s\n",cudaGetErrorString(cudaPeekAtLastError()));

  // Measure and Report computation time
  t = clock()-t; printf("Computing time (%f seconds).\n",((float)t)/CLOCKS_PER_SEC);

  // Copy data from device to host
  checkCuda(cudaMemcpy(h_u,d_u,sizeof(double)*(NX*NY*NZ),cudaMemcpyDeviceToHost));

  // uncomment to print solution to terminal
  if (DEBUG) Print2D(h_u);

  // Write solution to file
  Save_Results(h_u); 

  // Free device memory
  checkCuda(cudaFree(d_u));
  checkCuda(cudaFree(d_un));

  // Reset device
  checkCuda(cudaDeviceReset());

  // Free memory on host and device
  free(h_u);

  return 0;
}
Пример #5
0
void cuda_random(float *x_gpu, size_t n)
{
    static curandGenerator_t gen;
    static int init = 0;
    if(!init){
        curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen, time(0));
        init = 1;
    }
    curandGenerateUniform(gen, x_gpu, n);
    check_error(cudaPeekAtLastError());
}
Пример #6
0
void cuda_random(float *x_gpu, size_t n) {
	static curandGenerator_t gen[16];
	static int init[16] = { 0 };
	int i = cuda_get_device();
	if (!init[i]) {
		curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
		curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
		init[i] = 1;
	}
	curandGenerateUniform(gen[i], x_gpu, n);
	check_error(cudaPeekAtLastError());
}
Пример #7
0
 void cudaCheck(const int line, const std::string& function, const std::string& file)
 {
     try
     {
         #ifdef USE_CUDA
             const auto errorCode = cudaPeekAtLastError();
             if(errorCode != cudaSuccess)
                 error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "): "
                       + cudaGetErrorString(errorCode), line, function, file);
         #else
             UNUSED(line);
             UNUSED(function);
             UNUSED(file);
             error("OpenPose must be compiled with the `USE_CUDA` macro definition in order to use this"
                   " functionality.", __LINE__, __FUNCTION__, __FILE__);
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #8
0
extern inline void _cudaCheckError(const char *file, const int line) {
#ifndef NDEBUG
  cudaDeviceSynchronize();
#endif
  cudaError_t ret = cudaPeekAtLastError();
  if (ret != cudaSuccess) {
    fprintf(stderr, "cudaCheckError() failed at %s:%i: %s\n", file, line,
            cudaGetErrorString(cudaGetLastError()));
#ifndef NDEBUG
    switch (ret) {
    case cudaErrorMissingConfiguration:
      fprintf(stderr, "Error type: cudaErrorMissingConfiguration\n");
      fprintf(stderr, "Missing configuration error.\n");
      break;
    case cudaErrorMemoryAllocation:
      fprintf(stderr, "Error type: cudaErrorMemoryAllocation\n");
      fprintf(stderr, "Memory allocation error.\n");
      break;
    case cudaErrorInitializationError:
      fprintf(stderr, "Error type: cudaErrorInitializationError\n");
      fprintf(stderr, "Initialization error.\n");
      break;
    case cudaErrorLaunchFailure:
      fprintf(stderr, "Error type: cudaErrorLaunchFailure\n");
      fprintf(stderr, "Launch failure.\n");
      break;
    case cudaErrorPriorLaunchFailure:
      fprintf(stderr, "Error type: cudaErrorPriorLaunchFailure\n");
      fprintf(stderr, "Prior launch failure.\n");
      break;
    case cudaErrorLaunchTimeout:
      fprintf(stderr, "Error type: cudaErrorLaunchTimeout\n");
      fprintf(stderr, "Launch timeout error.\n");
      break;
    case cudaErrorLaunchOutOfResources:
      fprintf(stderr, "Error type: cudaErrorLaunchOutOfResources\n");
      fprintf(stderr, "Launch out of resources error.\n");
      break;
    case cudaErrorInvalidDeviceFunction:
      fprintf(stderr, "Error type: cudaErrorInvalidDeviceFunction\n");
      fprintf(stderr, "Invalid device function.\n");
      break;
    case cudaErrorInvalidConfiguration:
      fprintf(stderr, "Error type: cudaErrorInvalidConfiguration\n");
      fprintf(stderr, "Invalid configuration.\n");
      break;
    case cudaErrorInvalidDevice:
      fprintf(stderr, "Error type: cudaErrorInvalidDevice\n");
      fprintf(stderr, "Invalid device.\n");
      break;
    case cudaErrorInvalidValue:
      fprintf(stderr, "Error type: cudaErrorInvalidValue\n");
      fprintf(stderr, "Invalid value.\n");
      break;
    case cudaErrorInvalidPitchValue:
      fprintf(stderr, "Error type: cudaErrorInvalidPitchValue\n");
      fprintf(stderr, "Invalid pitch value.\n");
      break;
    case cudaErrorInvalidSymbol:
      fprintf(stderr, "Error type: cudaErrorInvalidSymbol\n");
      fprintf(stderr, "Invalid symbol.\n");
      break;
    case cudaErrorMapBufferObjectFailed:
      fprintf(stderr, "Error type: cudaErrorMapBufferObjectFailed\n");
      fprintf(stderr, "Map buffer object failed.\n");
      break;
    case cudaErrorUnmapBufferObjectFailed:
      fprintf(stderr, "Error type: cudaErrorUnmapBufferObjectFailed\n");
      fprintf(stderr, "Unmap buffer object failed.\n");
      break;
    case cudaErrorInvalidHostPointer:
      fprintf(stderr, "Error type: cudaErrorInvalidHostPointer\n");
      fprintf(stderr, "Invalid host pointer.\n");
      break;
    case cudaErrorInvalidDevicePointer:
      fprintf(stderr, "Error type: cudaErrorInvalidDevicePointer\n");
      fprintf(stderr, "Invalid device pointer.\n");
      break;
    case cudaErrorInvalidTexture:
      fprintf(stderr, "Error type: cudaErrorInvalidTexture\n");
      fprintf(stderr, "Invalid device pointer.\n");
      break;
    case cudaErrorInvalidTextureBinding:
      fprintf(stderr, "Error type: cudaErrorInvalidTextureBinding\n");
      fprintf(stderr, "Invalid texture binding.\n");
      break;
    case cudaErrorInvalidChannelDescriptor:
      fprintf(stderr, "Error type: cudaErrorInvalidChannelDescriptor\n");
      fprintf(stderr, "Invalid channel descriptor.\n");
      break;
    case cudaErrorInvalidMemcpyDirection:
      fprintf(stderr, "Error type: cudaErrorInvalidMemcpyDirection\n");
      fprintf(stderr, "Invalid memcpy direction.\n");
      break;
    case cudaErrorAddressOfConstant:
      fprintf(stderr, "Error type: cudaErrorAddressOfConstant\n");
      fprintf(stderr, "Address of constant error.\n");
      break;
    case cudaErrorTextureFetchFailed:
      fprintf(stderr, "Error type: cudaErrorTextureFetchFailed\n");
      fprintf(stderr, "Texture fetch failed.\n");
      break;
    case cudaErrorTextureNotBound:
      fprintf(stderr, "Error type: cudaErrorTextureNotBound\n");
      fprintf(stderr, "Texture not bound error.\n");
      break;
    case cudaErrorSynchronizationError:
      fprintf(stderr, "Error type: cudaErrorSynchronizationError\n");
      fprintf(stderr, "Synchronization error.\n");
      break;
    case cudaErrorInvalidFilterSetting:
      fprintf(stderr, "Error type: cudaErrorInvalidFilterSetting\n");
      fprintf(stderr, "Invalid filter setting.\n");
      break;
    case cudaErrorInvalidNormSetting:
      fprintf(stderr, "Error type: cudaErrorInvalidNormSetting\n");
      fprintf(stderr, "Invalid norm setting.\n");
      break;
    case cudaErrorCudartUnloading:
      fprintf(stderr, "Error type: cudaErrorCudartUnloading\n");
      fprintf(stderr, "CUDA runtime unloading.\n");
      break;
    case cudaErrorUnknown:
      fprintf(stderr, "Error type: cudaErrorUnknown\n");
      fprintf(stderr, "Unknown error condition.\n");
      break;
    case cudaErrorNotYetImplemented:
      fprintf(stderr, "Error type: cudaErrorNotYetImplemented\n");
      fprintf(stderr, "Function not yet implemented.\n");
      break;
    case cudaErrorMemoryValueTooLarge:
      fprintf(stderr, "Error type: cudaErrorMemoryValueTooLarge\n");
      fprintf(stderr, "Memory value too large.\n");
      break;
    case cudaErrorInvalidResourceHandle:
      fprintf(stderr, "Error type: cudaErrorInvalidResourceHandle\n");
      fprintf(stderr, "Invalid resource handle.\n");
      break;
    case cudaErrorNotReady:
      fprintf(stderr, "Error type: cudaErrorNotReady\n");
      fprintf(stderr, "Not ready error.\n");
      break;
    case cudaErrorInsufficientDriver:
      fprintf(stderr, "Error type: cudaErrorInsufficientDriver\n");
      fprintf(stderr, "CUDA runtime is newer than driver.\n");
      break;
    case cudaErrorSetOnActiveProcess:
      fprintf(stderr, "Error type: cudaErrorSetOnActiveProcess\n");
      fprintf(stderr, "Set on active process error.\n");
      break;
    case cudaErrorNoDevice:
      fprintf(stderr, "Error type: cudaErrorNoDevice\n");
      fprintf(stderr, "No available CUDA device.\n");
      break;
    }
#endif
    cudaDeviceReset();
    exit(EXIT_FAILURE);
  }
}
Пример #9
0
void checkCudaError(int line) {
    checkCudaError(line, cudaPeekAtLastError());
}
Пример #10
0
/**
 * This function does what it says on the tin.
 */
void OptiXRenderer::performRender(long long int photons, int argc_mpi, char* argv_mpi[], int width, int height, float film_location) {
	// Keep track of time
	timeval tic;

	// Create OptiX context
	optix::Context context = optix::Context::create();
	context->setRayTypeCount( 1 );

	// Debug, this will make everything SLOOOOOW
	context->setPrintEnabled(false);

	// Set some CUDA flags
	cudaSetDeviceFlags(cudaDeviceMapHost | cudaDeviceLmemResizeToMax);

	// Set used devices
	int tmp[] = { 0, 1 };
	std::vector<int> v( tmp, tmp+2 );
	context->setDevices(v.begin(), v.end());

	// Report device usage
	int num_devices = context->getEnabledDeviceCount();
	printf("Using %d devices:\n", num_devices);
	std::vector<int> enabled_devices =  context->getEnabledDevices();
	for(int i=0;i<num_devices;i++) {
		printf("    Device #%d [%s]\n", enabled_devices[i], context->getDeviceName(enabled_devices[i]).c_str());
	}

	// Set some OptiX variables
	context->setStackSize(4096);

	// Report OptiX infomation
	int stack_size_in_bytes = context->getStackSize();
	printf("Optix stack size is %d bytes (~%d KB).\n.", stack_size_in_bytes, stack_size_in_bytes/1024);

	// Declare some variables
	int threads = 500000; //20000000;
	unsigned int iterations_on_device = 1;

	// Set some scene-wide variables
	context["photon_ray_type"]->setUint( 0u );
	context["scene_bounce_limit"]->setUint( 10u );
	context["scene_epsilon"]->setFloat( 1.e-4f );
	context["iterations"]->setUint(iterations_on_device);
	context["follow_photon"]->setInt(66752);

	// Convert our existing scene into an OptiX one
	convertToOptiXScene(context, width, height, film_location);

	// Report infomation
	printf("Rendering with:\n");
	printf("    %lld photons.\n", photons);
	printf("    %d threads.\n", threads);
	printf("    %d iterations per thread.\n", iterations_on_device);
	int launches = (photons/threads)/iterations_on_device;
	if(launches*threads*iterations_on_device<photons) {
		launches++;
		printf("    NOTE: You have asked for %lld photons, we are providing %lld photons instead.\n", photons, launches*threads*iterations_on_device);
	}
	printf("    %d optix launches.\n", launches);

	// Create buffer for random numbers
	optix::Buffer random_buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_USER, threads );
	random_buffer->setElementSize(sizeof(curandState));
	curandState* states_ptr[num_devices];

	// Intalise
	for(int i=0;i<num_devices;i++) {
		int device_id = enabled_devices[i];
		long memory_in_bytes = threads * sizeof(curandState);
		long memory_in_megabytes = memory_in_bytes/(1024*1024);
		printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for random states...\n", memory_in_bytes, memory_in_megabytes, device_id);
		gettimeofday(&tic, NULL);
		cudaSetDevice(device_id);
		cudaMalloc((void **)&states_ptr[i], memory_in_bytes);
		done(tic);
		CUDAWrapper executer;
		executer.curand_setup(threads, (void **)&states_ptr[i], time(NULL), i);
	}

	// Set as buffer on context
	context["states"]->set(random_buffer);

	// Wait
	printf("Waiting for random states to initalise...\n");
	gettimeofday(&tic, NULL);
	for(int i=0;i<num_devices;i++) {
		cudaSetDevice(enabled_devices[i]);
		sync_all_threads();
	}
	done(tic);

	// Bind to the OptiX buffer
	// We do this here because it cases a syncronise apparently
	for(int i=0;i<num_devices;i++) {
		random_buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) states_ptr[i]);
	}

	// Create Image buffer
	optix::Buffer buffer = context->createBufferForCUDA( RT_BUFFER_INPUT_OUTPUT | RT_BUFFER_GPU_LOCAL, RT_FORMAT_FLOAT4, width, height );
	optix::float4* imgs_ptr[num_devices];

	//cudaSetDevice(0);
	//cudaMalloc((void **)&states_ptr_0, threads * sizeof(curandState));
	for(int i=0;i<num_devices;i++) {
		int device_id = enabled_devices[i];
		long memory_in_bytes = width * height * sizeof(optix::float4);
		long memory_in_megabytes = memory_in_bytes/(1024*1024);
		printf("Allocating %ld bytes (~%ld MB) of memory on device #%d for image result...\n", memory_in_bytes, memory_in_megabytes, device_id);
		gettimeofday(&tic, NULL);
		cudaSetDevice(device_id);
		cudaMalloc((void **)&imgs_ptr[i], memory_in_bytes);
		done(tic);
		CUDAWrapper executer;
		executer.img_setup((void **)&imgs_ptr[i], width, height);
	}

	// Set as buffer on context
	context["output_buffer"]->set(buffer);

	// Wait for everytyhing to execute
	printf("Waiting for Image data to initalise...\n");
	gettimeofday(&tic, NULL);
	for(int i=0;i<num_devices;i++) {
		cudaSetDevice(enabled_devices[i]);
		sync_all_threads();
	}
	done(tic);

	// Bind to the OptiX buffer
	// We do this here because it cases a syncronise apparently
	for(int i=0;i<num_devices;i++) {
		buffer->setDevicePointer(enabled_devices[i], (CUdeviceptr) imgs_ptr[i]);
	}

	// Construct MPI
	int size, rank = 0;
	#ifndef PHOTON_MPI
	(void)size;
	(void)argc_mpi;
	(void)argv_mpi;
	#endif

	#ifdef PHOTON_MPI
		MPI::Init( argc_mpi, argv_mpi );

		//MPI_Get_processor_name(hostname,&strlen);
		rank = MPI::COMM_WORLD.Get_rank();
		size = MPI::COMM_WORLD.Get_size();
		printf("Hello, world; from process %d of %d\n", rank, size);

		// Adjust number of photons for MPI
		long long int long_size = (long long int) size;
		photons = photons/long_size;
		if(rank==0)	printf("MPI adjusted to %lld photons per thread", photons);
	#endif /* MPI */

	// Validate
	try{
		context->validate();
	}catch(Exception& e){
		printf("Validate error!\n");
		printf("    CUDA says  : %s\n",  cudaGetErrorString(cudaPeekAtLastError()));
		printf("    OptiX says : %s\n", e.getErrorString().c_str() );
		return;
	}

	// Compile context
	try{
		context->compile();
	}catch(Exception& e){
		printf("Compile error!\n");
		printf("    CUDA says  : %s\n",  cudaGetErrorString(cudaPeekAtLastError()));
		printf("    OptiX says : %s\n", e.getErrorString().c_str() );
		return;
	}

	// Render
	int current_launch = 0;
	try{
		printf("Begin render...\n");
		gettimeofday(&tic, NULL);
		for(current_launch=0;current_launch<launches;current_launch++) {
			printf("    ... %f percent\n", 100*((current_launch*1.0f*threads)/photons));
			context->launch(0 , threads );
		}
		done(tic);
	}catch(Exception& e){
		printf("Launch error on launch #%d!\n", current_launch);
		printf("    CUDA says  : %s\n",  cudaGetErrorString(cudaPeekAtLastError()));
		printf("    OptiX says : %s\n", e.getErrorString().c_str() );
		return;
	}

	#ifndef PHOTON_MPI

	#endif /* If not MPI */
	#ifdef PHOTON_MPI

		// Create MPI handles
		accImg = new Image(img->getWidth(), img->getHeight());
		MPI::Win window_r;
		MPI::Win window_g;
		MPI::Win window_b;

		// Construct an MPI Window to copy some data into, one for each colour.
		int size_in_bytes = sizeof(float)*img->getWidth()*img->getHeight();
		window_r = MPI::Win::Create(accImg->imageR, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD);
		window_g = MPI::Win::Create(accImg->imageG, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD);
		window_b = MPI::Win::Create(accImg->imageB, size_in_bytes, sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD);

		// Perform transfer
		window_r.Fence(0);
		window_g.Fence(0);
		window_b.Fence(0);
		window_r.Accumulate(
				img->imageR,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				0,
				0,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				MPI_SUM
		);
		window_g.Accumulate(
				img->imageG,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				0,
				0,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				MPI_SUM
		);
		window_b.Accumulate(
				img->imageB,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				0,
				0,
				img->getWidth()*img->getHeight(),
				MPI_FLOAT,
				MPI_SUM
		);
		window_r.Fence(0);
		window_g.Fence(0);
		window_b.Fence(0);
		window_r.Free();
	#endif /* MPI */
	// Output the image
	if(rank==0) {
		// Construct filename
		char sbuffer[100];
		sprintf(sbuffer, "photons-%d.ppm", 0);
		// This is the collected image data on the host
		optix::float4* img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4));
		// If we have more than one device we have to accumulate everything back into one buffer
		if(num_devices == 1) {
			img_host_ptr = (optix::float4*) malloc(width*height*sizeof(optix::float4));
			cudaMemcpy(img_host_ptr, imgs_ptr[0], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost);
		} else {
			printf("We are using %d GPUs, accumulating result...", num_devices);
			gettimeofday(&tic, NULL);
			// Create an accumulate buffer on GPU #0#
// 			int device_id = enabled_devices[0];
// 			cudaSetDevice(device_id);
// 			optix::float4* accumulate_dev_ptr;
// 			cudaMalloc((void **)&accumulate_dev_ptr, width*height*sizeof(optix::float4));
// 			// put the array of memory ptrs on device 0
// 			optix::float4** ptrs_dev_ptr;
// 			cudaMalloc((void **)&ptrs_dev_ptr, num_devices*sizeof(optix::float4*));
// 			// Copy data over
// 			cudaMemcpy(ptrs_dev_ptr, imgs_ptr, num_devices*sizeof(optix::float4*), cudaMemcpyHostToDevice);
// 			CUDAWrapper executer;
// 			executer.img_accumulate((void ***)&ptrs_dev_ptr, (void **)&accumulate_dev_ptr, num_devices, width, height);
// 			cudaMemcpy(img_host_ptr, accumulate_dev_ptr, width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost);
			// Copy everything to host and accumulate here
			optix::float4* host_buffers[num_devices];
			for(int i=0;i<num_devices;i++) {
				host_buffers[i] = (optix::float4*) malloc(width*height*sizeof(optix::float4));
				cudaMemcpy(host_buffers[i], imgs_ptr[i], width*height*sizeof(optix::float4), cudaMemcpyDeviceToHost);
			}
			// Acumulate
			for(int i=0;i<width*height;i++) {
				img_host_ptr[i] = make_float4(0, 0, 0, 0);
				for(int j=0;j<num_devices;j++) {
					img_host_ptr[i].x += host_buffers[j][i].x;
					img_host_ptr[i].y += host_buffers[j][i].y;
					img_host_ptr[i].z += host_buffers[j][i].z;
					img_host_ptr[i].w += host_buffers[j][i].w;
				}
			}
			for(int i=0;i<num_devices;i++) {
				free(host_buffers[i]);
			}
			done(tic);
		}
		printf("Saving Image to %s...\n", sbuffer);
		gettimeofday(&tic, NULL);
		saveToPPMFile(sbuffer, img_host_ptr, width, height);
		free(img_host_ptr);
		done(tic);
	}
	#ifdef PHOTON_MPI
		// Teardown MPI
		MPI::Finalize();
	#endif /* MPI */
}
Пример #11
0
/*===========================================================================*/
cudaError_t PeekAtLastError()
{
    return cudaPeekAtLastError();
}
Пример #12
0
void _cudaCheck(const char* file, int line)
{
   _cudaAssert(cudaPeekAtLastError(), file, line);
}
Пример #13
0
void cuda_check_last_error()
{
    check_cuda_error( cudaPeekAtLastError());
}