Example #1
// Compute 2d pattern on each device.
int process(
	coaccel_device_group group, coaccel_device device,
	int idevice, int ithread, void* arg)
{
	// Unpack my personal config structure.
	config_t* config = (config_t*)arg + idevice;

	// Select callee depending on the current device mode.
	switch (coaccel_device_get_mode(device))
	{
		case COACCEL_DEVMODE_CUDA_SYNC :
			coaccel_device_lock(device);
			pattern2d_gpu(1, config->nx, 1, 1, config->ny, 1,
				config->in_dev, config->out_dev, idevice);
			coaccel_device_unlock(device);
			break;
		case COACCEL_DEVMODE_CPU_SYNC :
			pattern2d_cpu(1, config->nx, 1, 1, config->ny, 1,
				config->in_dev, config->out_dev, idevice);
			break;
	}

	config->step++;
	
	// Swap device input and output buffers.
	float* swap = config->in_dev;
	config->in_dev = config->out_dev;
	config->out_dev = swap;

	printf("Device %d completed step %d\n", idevice, config->step);

	return 0;
}
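
The process callback above unpacks its per-device state from a config_t record passed through the opaque arg pointer. The struct itself is not part of the listing; a minimal sketch, assuming only the fields the callback actually touches, could look like this (hypothetical layout):

// Hypothetical per-device configuration record. Field names are taken
// from the callback above; the exact original definition is not shown.
typedef struct
{
	int nx, ny;      // Dimensions of the 2d pattern grid.
	int step;        // Number of completed iterations.
	float* in_dev;   // Device input buffer.
	float* out_dev;  // Device output buffer.
} config_t;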
Example #2
int main(int argc, char* argv[])
{
    int ndevices = 0;
    cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
    if (cuda_status != cudaSuccess)
    {
        fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n",
                cuda_status, cudaGetErrorString(cuda_status));
        return cuda_status;
    }

    // Return if no cuda devices present.
    printf("%d CUDA device(s) found\n", ndevices);
    if (!ndevices) return 0;

    // Create input data. Each device will have an equal
    // piece of data.
    size_t np = nx * ny, size = np * sizeof(float);
    float* data = (float*)malloc(size * 2);
    float *input = data, *output = data + np;
    float invdrandmax = 1.0 / RAND_MAX;
    for (size_t i = 0; i < np; i++)
        input[i] = rand() * invdrandmax;

    // Timing helpers (struct time_t, get_time, get_time_diff) come from
    // the example's utility header; see the sketch after this example.
    struct time_t start, finish;
    get_time(&start);

    // Get control result on CPU (to compare with results on devices).
    pattern2d_cpu(1, nx, 1, 1, ny, 1, input, output, ndevices);

    get_time(&finish);

    printf("CPU time = %f sec\n", get_time_diff(&start, &finish));

    // Create config structures to store device-specific
    // values.
    config_t* configs = (config_t*)malloc(
                            sizeof(config_t) * ndevices);

    // Initialize CUDA devices.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // Set current CUDA device to idevice.
        cudaError_t cuda_status = cudaSetDevice(idevice);
        if (cuda_status != cudaSuccess) {
            fprintf(stderr, "Cannot set CUDA device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Create device arrays for input and output data.
        cuda_status = cudaMalloc((void**)&config->in_dev, size);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot allocate CUDA input buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }
        cuda_status = cudaMalloc((void**)&config->out_dev, size);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot allocate CUDA output buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Copy input data to device buffer.
        cuda_status = cudaMemcpy(config->in_dev, input, size,
                                 cudaMemcpyHostToDevice);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot copy input data to CUDA buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        printf("Device %d initialized\n", idevice);
    }

    // Start execution of kernels. One kernel
    // is executed on each device in parallel.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // Set current CUDA device to idevice.
        cudaError_t cuda_status = cudaSetDevice(idevice);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot set CUDA device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        get_time(&config->start);

        // Run test kernel on the current device.
        int status = pattern2d_gpu(1, nx, 1, 1, ny, 1,
                                   config->in_dev, config->out_dev, idevice);
        if (status)
        {
            fprintf(stderr, "Cannot execute pattern 2d on device %d, status = %d: %s\n",
                    idevice, status, cudaGetErrorString(status));
            return status;
        }
    }

    // Synchronize kernels execution.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // Set current CUDA device to idevice.
        cudaError_t cuda_status = cudaSetDevice(idevice);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot set CUDA device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Wait for current device to finish processing
        // the kernels.
        cuda_status = cudaDeviceSynchronize();
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot synchronize device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        get_time(&finish);

        printf("GPU %d time = %f sec\n", idevice,
               get_time_diff(&config->start, &finish));
    }

    // Check results and dispose resources used by devices.
    for (int idevice = 0; idevice < ndevices; idevice++)
    {
        config_t* config = configs + idevice;

        // Set current CUDA device to idevice.
        cuda_status = cudaSetDevice(idevice);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot set CUDA device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Offload results back to host memory.
        cuda_status = cudaMemcpy(input, config->out_dev, size,
                                 cudaMemcpyDeviceToHost);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot copy output data from CUDA buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        // Free device arrays.
        cuda_status = cudaFree(config->in_dev);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot release input buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }
        cuda_status = cudaFree(config->out_dev);
        if (cuda_status != cudaSuccess)
        {
            fprintf(stderr, "Cannot release output buffer on device %d, status = %d: %s\n",
                    idevice, cuda_status, cudaGetErrorString(cuda_status));
            return cuda_status;
        }

        printf("Device %d deinitialized\n", idevice);

        // Compare each GPU result to CPU result.
        // Find the maximum abs difference.
        int maxi = 0, maxj = 0;
        float maxdiff = fabs(input[0] - output[0]);
        for (int j = 0; j < ny; j++)
        {
            for (int i = 0; i < nx; i++)
            {
                float diff = fabs(
                                 input[i + j * nx] - output[i + j * nx]);
                if (diff > maxdiff)
                {
                    maxdiff = diff;
                    maxi = i;
                    maxj = j;
                }
            }
        }

        printf("Device %d result abs max diff = %f @ (%d,%d)\n",
               idevice, maxdiff, maxi, maxj);
    }

    // Measure time between first GPU launch and last GPU
    // finish. This will show how much time is spent on GPU
    // kernels in total.
    // XXX If this time is comparable to the time of an
    // individual GPU, then we likely reached our goal:
    // kernels are executed in parallel.
    printf("Total time of %d GPUs = %f\n", ndevices,
           get_time_diff(&configs[0].start, &finish));

    free(configs);
    free(data);

    return 0;
}
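
Example #2 calls get_time and get_time_diff on a struct time_t value; these helpers belong to the example's utility code, which is not shown. A minimal sketch of what they could look like, assuming POSIX clock_gettime is available (the real implementation may differ):

#include <time.h>

// Hypothetical timing helpers matching the calls above. The struct tag
// time_t lives in a separate namespace from the time_t typedef, so this
// declaration is legal C.
struct time_t
{
	struct timespec ts;
};

static void get_time(struct time_t* t)
{
	clock_gettime(CLOCK_REALTIME, &t->ts);
}

static double get_time_diff(struct time_t* start, struct time_t* finish)
{
	return (finish->ts.tv_sec - start->ts.tv_sec) +
		(finish->ts.tv_nsec - start->ts.tv_nsec) * 1e-9;
}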
Example #3
int main(int argc, char* argv[])
{
	// Process config (to be filled completely
	// later).
	config_t config;
	config.idevice = 0;
	config.nx = nx;
	config.ny = ny;
	config.step = 0;

	// Create shared memory region.
	int fd = shm_open("/shmem_mmap_cuda_shm",
		O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
	if (fd == -1)
	{
		fprintf(stderr, "Cannot open shared region, errno = %d\n", errno);
		return errno;
	}

	// Create first semaphore (set to 0 to create it initially locked).
	sem_t* sem1 = sem_open("/shmem_mmap_cuda_sem1",
		O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
	if (sem1 == SEM_FAILED)
	{
		fprintf(stderr, "Cannot open semaphore #1, errno = %d\n", errno);
		return errno;
	}

	// Create second semaphore (set to 0 to create it initially locked).
	sem_t* sem2 = sem_open("/shmem_mmap_cuda_sem2",
		O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
	if (sem2 == SEM_FAILED)
	{
		fprintf(stderr, "Cannot open semaphore #2, errno = %d\n", errno);
		return errno;
	}

	// Call fork to create another process.
	// Standard: "Memory mappings created in the parent
	// shall be retained in the child process."
	pid_t fork_status = fork();
	
	// From this point on, if no error occurred, two processes run the same code.
	if (fork_status == -1)
	{
		fprintf(stderr, "Cannot fork process, errno = %d\n", errno);
		return errno;
	}

	// Get the process ID.
	int pid = (int)getpid();

	// By fork return value we can determine the process role:
	// master or child (worker).
	int master = fork_status ? 1 : 0, worker = !master;

	int ndevices = 0;
	cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot get the cuda device count by process %d, status = %d\n",
			pid, cuda_status);
		return cuda_status;
	}
	
	// Return if no cuda devices present.
	if (master)
		printf("%d CUDA device(s) found\n", ndevices);
	if (!ndevices) return 0;
	// This example uses a single CUDA device (the worker); the master runs on the CPU.
	ndevices = 1;

	size_t np = nx * ny;
	size_t size = np * sizeof(float);

	float* inout;
	
	if (!master)
	{
		// Wait on the semaphore until the master finishes shared region configuration.
		int sem_status = sem_wait(sem1);
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot wait on semaphore by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}

		// Map the shared region into the address space of the current process.
		inout = (float*)mmap(0, size * (ndevices + 1),
			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (inout == MAP_FAILED)
		{
			fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	else
	{
		config.idevice = ndevices;

		// Set shared region size.
		int ftruncate_status = ftruncate(fd, size * (ndevices + 1));
		if (ftruncate_status == -1)
		{
			fprintf(stderr, "Cannot truncate shared region, errno = %d\n", errno);
			return errno;
		}

		// Map the shared region into the address space of the current process.
		inout = (float*)mmap(0, size * (ndevices + 1),
			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (inout == MAP_FAILED)
		{
			fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}

		// Create input data. Let each device have an equal piece
		// of a single shared data array.
		float invdrandmax = 1.0 / RAND_MAX;
		for (size_t i = 0; i < np; i++)
			inout[i] = rand() * invdrandmax;
		for (int i = 0; i < ndevices; i++)
			memcpy(inout + np * (i + 1), inout, np * sizeof(float));

		// Sync changed content with shared region.
		int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
		if (msync_status == -1)
		{
			fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n",
				inout, errno);
			return errno;
		}

		// Unlock semaphore to let other processes move forward.
		int sem_status = sem_post(sem1);
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot post on semaphore by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}

	config.inout_cpu = inout + config.idevice * np;

	// Let workers use CUDA devices, and the master use the CPU.
	// Create device buffers.
	if (worker)
	{
		// Create device arrays for input and output data.
		cuda_status = cudaMalloc((void**)&config.in_dev, size);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot allocate CUDA input buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
		cuda_status = cudaMalloc((void**)&config.out_dev, size);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot allocate CUDA output buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
	}
	else
	{
		// Create device arrays for input and output data.
		config.in_dev = (float*)malloc(size);
		config.out_dev = (float*)malloc(size);
	}

	printf("Device %d initialized py process %d\n", config.idevice, pid);

	// Perform some "iterations" on data arrays, assigned to devices,
	// and shift input data array after each iteration.
	for (int i = 0; i < nticks; i++)
	{
		int status;
		if (master)
		{
			// Copy input data to device buffer.
			memcpy(config.in_dev, config.inout_cpu, size);

			status = pattern2d_cpu(1, config.nx, 1, 1, config.ny, 1,
				config.in_dev, config.out_dev, config.idevice);
			if (status)
			{
				fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n",
					pid, status);
				return status;
			}

			// Copy output data from device buffer.
			memcpy(config.inout_cpu, config.out_dev, size);

			// Sync with changed content in shared region.
			int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
			if (msync_status == -1)
			{
				fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n",
					inout, errno);
				return errno;
			}

			int sem_status = sem_post(sem1);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #1 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}	

			sem_status = sem_wait(sem2);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}	
		}
		else
		{
			// Copy input data to device buffer.
			cuda_status = cudaMemcpy(config.in_dev, config.inout_cpu, size,
				cudaMemcpyHostToDevice);
			if (cuda_status != cudaSuccess)
			{
				fprintf(stderr, "Cannot copy input data to CUDA buffer by process %d, status = %d\n",
					pid, cuda_status);
				return cuda_status;
			}

			status = pattern2d_gpu(1, config.nx, 1, 1, config.ny, 1,
				config.in_dev, config.out_dev, config.idevice);
			if (status)
			{
				fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n",
					pid, status);
				return status;
			}

			// Copy output data from device buffer.
			cuda_status = cudaMemcpy(config.inout_cpu, config.out_dev, size,
				cudaMemcpyDeviceToHost);
			if (cuda_status != cudaSuccess)
			{
				fprintf(stderr, "Cannot copy output data from CUDA buffer by process %d, status = %d\n",
					pid, cuda_status);
				return cuda_status;
			}

			// Sync with changed content in shared region.
			int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
			if (msync_status == -1)
			{
				fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n",
					inout, errno);
				return errno;
			}
			
			int sem_status = sem_wait(sem1);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}			

			sem_status = sem_post(sem2);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}
		}

		// At this point two processes are synchronized.

		config.step++;
		
		// Reassign processes' input data segments to demonstrate a possible
		// manipulation of shared memory: perform a cyclic shift of data pointers.
		config.idevice++;
		config.idevice %= ndevices + 1;
		config.inout_cpu = inout + config.idevice * np;
	}

	// Release device buffers.
	if (worker)
	{
		cuda_status = cudaFree(config.in_dev);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
		cuda_status = cudaFree(config.out_dev);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
	}
	else
	{
		free(config.in_dev);
		free(config.out_dev);
	}
	
	printf("Device %d deinitialized py process %d\n", config.idevice, pid);

	// On master process perform results check:
	// compare each GPU result to CPU result.
	if (master)
	{
		float* control = inout + np * ndevices;
		for (int idevice = 0; idevice < ndevices; idevice++)
		{
			// Find the maximum abs difference.
			int maxi = 0, maxj = 0;
			float maxdiff = fabs(control[0] - (inout + idevice * np)[0]);
			for (int j = 0; j < ny; j++)
			{
				for (int i = 0; i < nx; i++)
				{
					float diff = fabs(
						control[i + j * nx] -
						(inout + idevice * np)[i + j * nx]);
					if (diff > maxdiff)
					{
						maxdiff = diff;
						maxi = i; maxj = j;
					}
				}
			}
			printf("Device %d result abs max diff = %f @ (%d,%d)\n",
				idevice, maxdiff, maxi, maxj);
		}
	}
	
	// Unlink semaphore.
	if (master)
	{
		int sem_status = sem_unlink("/shmem_mmap_cuda_sem1");
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot unlink semaphore #1 by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	
	// Close semaphore.
	int sem_status = sem_close(sem1);
	if (sem_status == -1)
	{
		fprintf(stderr, "Cannot close semaphore #1 by process %d, errno = %d\n",
			pid, errno);
		return errno;
	}

	// Unlink semaphore.
	if (master)
	{
		int sem_status = sem_unlink("/shmem_mmap_cuda_sem2");
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot unlink semaphore #2 by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	
	// Close semaphore.
	sem_status = sem_close(sem2);
	if (sem_status == -1)
	{
		fprintf(stderr, "Cannot close semaphore #2 by process %d, errno = %d\n",
			pid, errno);
		return errno;
	}

	// Close the shared region descriptor and unmap the region.
	close(fd);
	int munmap_status = munmap(inout, size * (ndevices + 1));
	if (munmap_status == -1)
	{
		fprintf(stderr, "Cannot unmap shared region by process %d, errno = %d\n",
			pid, errno);
		return errno;
	}
	
	// Unlink shared region.
	if (master)
	{
		int unlink_status = shm_unlink("/shmem_mmap_cuda_shm");
		if (unlink_status == -1)
		{
			fprintf(stderr, "Cannot unlink shared region by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	
	return 0;
}
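
As in Example #2, the config_t definition is not part of this listing. A sketch consistent with the members referenced above (hypothetical, for readability only):

// Hypothetical config record for Example #3; fields mirror the members
// used in main above, the actual definition is not shown.
typedef struct
{
	int idevice;       // Index of the segment/device owned by this process.
	int nx, ny;        // Grid dimensions.
	int step;          // Completed iteration count.
	float* inout_cpu;  // This process's segment of the shared region.
	float* in_dev;     // Input buffer: device memory on worker, host on master.
	float* out_dev;    // Output buffer, same convention.
} config_t;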
Example #4
int main(int argc, char* argv[])
{
	// Initialize MPI. From this point the specified
	// number of processes will be executed in parallel.
	int mpi_status = MPI_Init(&argc, &argv);
	int mpi_error_msg_length;
	char mpi_error_msg[MPI_MAX_ERROR_STRING];
	if (mpi_status != MPI_SUCCESS)
	{
		MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length);
		fprintf(stderr, "Cannot initialize MPI, status = %s\n",
			mpi_error_msg);
		return 1;
	}
	
	// Get the size of the MPI global communicator,
	// that is, the total number of MPI processes.
	int nprocesses;
	mpi_status = MPI_Comm_size(MPI_COMM_WORLD, &nprocesses);
	if (mpi_status != MPI_SUCCESS)
	{
		MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length);
		fprintf(stderr, "Cannot retrieve the number of MPI processes, status = %s\n",
			mpi_error_msg);
		return 1;
	}
	
	// Get the rank (index) of the current MPI process
	// in the global communicator.
	int iprocess;
	mpi_status = MPI_Comm_rank(MPI_COMM_WORLD, &iprocess);
	if (mpi_status != MPI_SUCCESS)
	{
		MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length);
		fprintf(stderr, "Cannot retrieve the rank of current MPI process, status = %s\n",
			mpi_error_msg);
		return 1;
	}

	int ndevices = 0;
	cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot get the cuda device count by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}

	// Return if no cuda devices present.
	if (iprocess == 0)
		printf("%d CUDA device(s) found\n", ndevices);
	if (!ndevices) return 0;

	// Get problem size from the command line.
	if (argc != 3)
	{
		printf("Usage: %s <n> <npasses>\n", argv[0]);
		return 0;
	}
	
	int n = atoi(argv[1]);
	int npasses = atoi(argv[2]);
	size_t size = n * n * sizeof(float);
	
	if ((n <= 0) || (npasses <= 0)) return 0;

	// Assign a unique device to each MPI process.
	cuda_status = cudaSetDevice(iprocess);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot set CUDA device by process %d, status= %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}

	// Create two device input buffers.
	float *din1, *din2;
	cuda_status = cudaMalloc((void**)&din1, size);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot allocate input device buffer by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}
	cuda_status = cudaMalloc((void**)&din2, size);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot allocate input device buffer by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}
	
	// Create device output buffer.
	float* dout;
	cuda_status = cudaMalloc((void**)&dout, size);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot allocate output device buffer by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}

	float* hin = (float*)malloc(size);
	float* hout = (float*)malloc(size);
	
	// Generate random input data.
	double dinvrmax = 1.0 / RAND_MAX;
	for (int i = 0; i < n * n; i++)
	{
		for (int j = 0; j < iprocess + 1; j++)
			hin[i] += rand() * dinvrmax;
		hin[i] /= iprocess + 1;
	}
	
	// Copy input data generated on host to device buffer.
	cuda_status = cudaMemcpy(din1, hin, size, cudaMemcpyHostToDevice); 
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot copy input data from host to device by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}
	
	// Perform the specified number of processing passes.
	for (int ipass = 0; ipass < npasses; ipass++)
	{
		// Fill output device buffer with zeros.
		cuda_status = cudaMemset(dout, 0, size);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot fill output device buffer with zeros by process %d, status = %s\n",
				iprocess, cudaGetErrorString(cuda_status));
			return 1;
		}

		// Process data on GPU.
		pattern2d_gpu(1, n, 1, 1, n, 1, din1, dout);

		// Wait for GPU kernels to finish processing.
		cuda_status = cudaDeviceSynchronize();
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot synchronize GPU kernel by process %d, status = %s\n",
				iprocess, cudaGetErrorString(cuda_status));
			return 1;
		}
	
		// Copy output data back from device to host.
		cuda_status = cudaMemcpy(hout, dout, size, cudaMemcpyDeviceToHost); 
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot copy output data from device to host by process %d, status = %s\n",
				iprocess, cudaGetErrorString(cuda_status));
			return 1;
		}
	
		// Output average value of the resulting field.
		float avg = 0.0;
		for (int i = 0; i < n * n; i++)
			avg += hout[i];
		avg /= n * n;
		printf("Sending process %d resulting field with average = %f to process %d\n",
			iprocess, avg, (iprocess + 1) % nprocesses);

		MPI_Request request;
		int inext = (iprocess + 1) % nprocesses;
		int iprev = (iprocess - 1 + nprocesses) % nprocesses;
	
		// Pass this process's entire input device buffer directly to the
		// input device buffer of the next process.
		mpi_status = MPI_Isend(din1, n * n, MPI_FLOAT, inext, 0, MPI_COMM_WORLD, &request);
		mpi_status = MPI_Recv(din2, n * n, MPI_FLOAT, iprev, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		mpi_status = MPI_Wait(&request, MPI_STATUS_IGNORE);
		
		// Swap buffers.
		float* swap = din1; din1 = din2; din2 = swap;
	}
	
	cuda_status = cudaFree(din1);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot free input device buffer by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}

	cuda_status = cudaFree(din2);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot free second input device buffer by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}

	cuda_status = cudaFree(dout);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot free output device buffer by process %d, status = %s\n",
			iprocess, cudaGetErrorString(cuda_status));
		return 1;
	}

	free(hin);
	free(hout);

	mpi_status = MPI_Finalize();
	if (mpi_status != MPI_SUCCESS)
	{
		MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length);
		fprintf(stderr, "Cannot finalize MPI, status = %s\n",
			mpi_error_msg);
		return 1;
	}
	
	return 0;
}
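
Note that MPI_Isend and MPI_Recv are invoked here directly on device pointers (din1, din2), which requires a CUDA-aware MPI build. With a plain MPI, the exchange would have to be staged through host memory; a sketch of such a fallback, reusing the hin/hout host buffers from the example (error checking omitted for brevity):

		// Fallback for a non-CUDA-aware MPI (sketch): stage through host buffers.
		cudaMemcpy(hin, din1, size, cudaMemcpyDeviceToHost);
		MPI_Isend(hin, n * n, MPI_FLOAT, inext, 0, MPI_COMM_WORLD, &request);
		MPI_Recv(hout, n * n, MPI_FLOAT, iprev, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		MPI_Wait(&request, MPI_STATUS_IGNORE);
		cudaMemcpy(din2, hout, size, cudaMemcpyHostToDevice);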