// Compute 2d pattern on each device. int process( coaccel_device_group group, coaccel_device device, int idevice, int ithread, void* arg) { // Unpack my personal config structure. config_t* config = (config_t*)arg + idevice; // Switch callee, depending on the current device mode. switch (coaccel_device_get_mode(device)) { case COACCEL_DEVMODE_CUDA_SYNC : coaccel_device_lock(device); pattern2d_gpu(1, config->nx, 1, 1, config->ny, 1, config->in_dev, config->out_dev, idevice); coaccel_device_unlock(device); break; case COACCEL_DEVMODE_CPU_SYNC : pattern2d_cpu(1, config->nx, 1, 1, config->ny, 1, config->in_dev, config->out_dev, idevice); } config->step++; // Swap device input and output buffers. float* swap = config->in_dev; config->in_dev = config->out_dev; config->out_dev = swap; printf("Device %d completed step %d\n", idevice, config->step); return 0; }
int main(int argc, char* atgv[]) { int ndevices = 0; cudaError_t cuda_status = cudaGetDeviceCount(&ndevices); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n", cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Return if no cuda devices present. printf("%d CUDA device(s) found\n", ndevices); if (!ndevices) return 0; // Create input data. Each device will have an equal // piece of data. size_t np = nx * ny, size = np * sizeof(float); float* data = (float*)malloc(size * 2); float *input = data, *output = data + np; float invdrandmax = 1.0 / RAND_MAX; for (size_t i = 0; i < np; i++) input[i] = rand() * invdrandmax; struct time_t start, finish; get_time(&start); // Get control result on CPU (to compare with results on devices). pattern2d_cpu(1, nx, 1, 1, ny, 1, input, output, ndevices); get_time(&finish); printf("CPU time = %f sec\n", get_time_diff(&start, &finish)); // Create config structures to store device-specific // values. config_t* configs = (config_t*)malloc( sizeof(config_t) * ndevices); // Initialize CUDA devices. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. cudaError_t cuda_status = cudaSetDevice(idevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count, status = %d: %s\n", cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Create device arrays for input and output data. cuda_status = cudaMalloc((void**)&config->in_dev, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA input buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } cuda_status = cudaMalloc((void**)&config->out_dev, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA output buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Copy input data to device buffer. cuda_status = cudaMemcpy(config->in_dev, input, size, cudaMemcpyHostToDevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy input data to CUDA buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } printf("Device %d initialized\n", idevice); } // Start execution of kernels. One kernel // is executed on each device in parallel. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. cudaError_t cuda_status = cudaSetDevice(idevice); get_time(&config->start); // Run test kernel on the current device. int status = pattern2d_gpu(1, nx, 1, 1, ny, 1, config->in_dev, config->out_dev, idevice); if (status) { fprintf(stderr, "Cannot execute pattern 2d on device %d, status = %d: %s\n", idevice, status, cudaGetErrorString(status)); return status; } } // Synchronize kernels execution. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. cudaError_t cuda_status = cudaSetDevice(idevice); // Wait for current device to finish processing // the kernels. cuda_status = cudaThreadSynchronize(); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot synchronize thread by device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } get_time(&finish); printf("GPU %d time = %f sec\n", idevice, get_time_diff(&config->start, &finish)); } // Check results and dispose resources used by devices. for (int idevice = 0; idevice < ndevices; idevice++) { config_t* config = configs + idevice; // TODO: Set curent CUDA device to idevice. // Offload results back to host memory. cuda_status = cudaMemcpy(input, config->out_dev, size, cudaMemcpyDeviceToHost); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy output data from CUDA buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } // Free device arrays. cuda_status = cudaFree(config->in_dev); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release input buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } cuda_status = cudaFree(config->out_dev); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release output buffer on device %d, status = %d: %s\n", idevice, cuda_status, cudaGetErrorString(cuda_status)); return cuda_status; } printf("Device %d deinitialized\n", idevice); // Compare each GPU result to CPU result. // Find the maximum abs difference. int maxi = 0, maxj = 0; float maxdiff = fabs(input[0] - output[0]); for (int j = 0; j < ny; j++) { for (int i = 0; i < nx; i++) { float diff = fabs( input[i + j * nx] - output[i + j * nx]); if (diff > maxdiff) { maxdiff = diff; maxi = i; maxj = j; } } } printf("Device %d result abs max diff = %f @ (%d,%d)\n", idevice, maxdiff, maxi, maxj); } // Measure time between first GPU launch and last GPU // finish. This will show how much time is spent on GPU // kernels in total. // XXX If this time is comparabe to the time of // individual GPU, then we likely reached our goal: // kernels are executed in parallel. printf("Total time of %d GPUs = %f\n", ndevices, get_time_diff(&configs[0].start, &finish)); free(configs); free(data); return 0; }
int main(int argc, char* argv[]) { // Process config (to be filled completely // later). config_t config; config.idevice = 0; config.nx = nx; config.ny = ny; config.step = 0; // Create shared memory region. int fd = shm_open("/shmem_mmap_cuda_shm", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); if (fd == -1) { fprintf(stderr, "Cannot open shared region, errno = %d\n", errno); return errno; } // Create first semaphore (set to 0 to create it initially locked). sem_t* sem1 = sem_open("/shmem_mmap_cuda_sem1", O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0); if (sem1 == SEM_FAILED) { fprintf(stderr, "Cannot open semaphore #1, errno = %d\n", errno); return errno; } // Create second semaphore (set to 0 to create it initially locked). sem_t* sem2 = sem_open("/shmem_mmap_cuda_sem2", O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0); if (sem2 == SEM_FAILED) { fprintf(stderr, "Cannot open semaphore #2, errno = %d\n", errno); return errno; } // Call fork to create another process. // Standard: "Memory mappings created in the parent // shall be retained in the child process." pid_t fork_status = fork(); // From this point two processes are running the same code, if no errors. if (fork_status == -1) { fprintf(stderr, "Cannot fork process, errno = %d\n", errno); return errno; } // Get the process ID. int pid = (int)getpid(); // By fork return value we can determine the process role: // master or child (worker). int master = fork_status ? 1 : 0, worker = !master; int ndevices = 0; cudaError_t cuda_status = cudaGetDeviceCount(&ndevices); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count by process %d, status = %d\n", pid, cuda_status); return cuda_status; } // Return if no cuda devices present. if (master) printf("%d CUDA device(s) found\n", ndevices); if (!ndevices) return 0; ndevices = 1; size_t np = nx * ny; size_t size = np * sizeof(float); float* inout; if (!master) { // Lock semaphore to finish shared region configuration on master. int sem_status = sem_wait(sem1); if (sem_status == -1) { fprintf(stderr, "Cannot wait on semaphore by process %d, errno = %d\n", pid, errno); return errno; } // Map the shared region into the address space of the current process. inout = (float*)mmap(0, size * (ndevices + 1), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (inout == MAP_FAILED) { fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n", pid, errno); return errno; } } else { config.idevice = ndevices; // Set shared region size. int ftrunk_status = ftruncate(fd, size * (ndevices + 1)); if (ftrunk_status == -1) { fprintf(stderr, "Cannot truncate shared region, errno = %d\n", errno); return errno; } // Map the shared region into the address space of the current process. inout = (float*)mmap(0, size * (ndevices + 1), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (inout == MAP_FAILED) { fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n", pid, errno); return errno; } // Create input data. Let each device to have an equal piece // of single shared data array. float invdrandmax = 1.0 / RAND_MAX; for (size_t i = 0; i < np; i++) inout[i] = rand() * invdrandmax; for (int i = 0; i < ndevices; i++) memcpy(inout + np * (i + 1), inout, np * sizeof(float)); // Sync changed content with shared region. int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC); if (msync_status == -1) { fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n", inout, errno); return errno; } // Unlock semaphore to let other processes to move forward. int sem_status = sem_post(sem1); if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore by process %d, errno = %d\n", pid, errno); return errno; } } config.inout_cpu = inout + config.idevice * np; // Let workers to use CUDA devices, and master - the CPU. // Create device buffers. if (worker) { // Create device arrays for input and output data. cuda_status = cudaMalloc((void**)&config.in_dev, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA input buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; } cuda_status = cudaMalloc((void**)&config.out_dev, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA output buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; } } else { // Create device arrays for input and output data. config.in_dev = (float*)malloc(size); config.out_dev = (float*)malloc(size); } printf("Device %d initialized py process %d\n", config.idevice, pid); // Perform some "iterations" on data arrays, assigned to devices, // and shift input data array after each iteration. for (int i = 0; i < nticks; i++) { int status; if (master) { // Copy input data to device buffer. memcpy(config.in_dev, config.inout_cpu, size); status = pattern2d_cpu(1, config.nx, 1, 1, config.ny, 1, config.in_dev, config.out_dev, config.idevice); if (status) { fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n", pid, status); return status; } // Copy output data from device buffer. memcpy(config.inout_cpu, config.out_dev, size); // Sync with changed content in shared region. int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC); if (msync_status == -1) { fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n", inout, errno); return errno; } int sem_status = sem_post(sem1); if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; } sem_status = sem_wait(sem2); if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; } } else { // Copy input data to device buffer. cuda_status = cudaMemcpy(config.in_dev, config.inout_cpu, size, cudaMemcpyHostToDevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy input data to CUDA buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; } status = pattern2d_gpu(1, config.nx, 1, 1, config.ny, 1, config.in_dev, config.out_dev, config.idevice); if (status) { fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n", pid, status); return status; } // Copy output data from device buffer. cuda_status = cudaMemcpy(config.inout_cpu, config.out_dev, size, cudaMemcpyDeviceToHost); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy output data from CUDA buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; } // Sync with changed content in shared region. int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC); if (msync_status == -1) { fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n", inout, errno); return errno; } int sem_status = sem_wait(sem1); if (sem_status == -1) { fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; } sem_status = sem_post(sem2); if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; } } // At this point two processes are synchronized. config.step++; // Reassign porcesses' input data segments to show some // possible manipulation on shared memory. // Here we perform cyclic shift of data pointers. config.idevice++; config.idevice %= ndevices + 1; config.inout_cpu = inout + config.idevice * np; } // Release device buffers. if (worker) { cuda_status = cudaFree(config.in_dev); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; } cuda_status = cudaFree(config.out_dev); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; } } else { free(config.in_dev); free(config.out_dev); } printf("Device %d deinitialized py process %d\n", config.idevice, pid); // On master process perform results check: // compare each GPU result to CPU result. if (master) { float* control = inout + np * ndevices; for (int idevice = 0; idevice < ndevices; idevice++) { // Find the maximum abs difference. int maxi = 0, maxj = 0; float maxdiff = fabs(control[0] - (inout + idevice * np)[0]); for (int j = 0; j < ny; j++) { for (int i = 0; i < nx; i++) { float diff = fabs( control[i + j * nx] - (inout + idevice * np)[i + j * nx]); if (diff > maxdiff) { maxdiff = diff; maxi = i; maxj = j; } } } printf("Device %d result abs max diff = %f @ (%d,%d)\n", idevice, maxdiff, maxi, maxj); } } // Unlink semaphore. if (master) { int sem_status = sem_unlink("/shmem_mmap_cuda_sem1"); if (sem_status == -1) { fprintf(stderr, "Cannot unlink semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; } } // Close semaphore. int sem_status = sem_close(sem1); if (sem_status == -1) { fprintf(stderr, "Cannot close semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; } // Unlink semaphore. if (master) { int sem_status = sem_unlink("/shmem_mmap_cuda_sem2"); if (sem_status == -1) { fprintf(stderr, "Cannot unlink semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; } } // Close semaphore. sem_status = sem_close(sem2); if (sem_status == -1) { fprintf(stderr, "Cannot close semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; } // Unmap shared region. close(fd); int munmap_status = munmap(inout, size * (ndevices + 1)); if (munmap_status == -1) { fprintf(stderr, "Cannot unmap shared region by process %d, errno = %d\n", pid, errno); return errno; } // Unlink shared region. if (master) { int unlink_status = shm_unlink("/shmem_mmap_cuda_shm"); if (unlink_status == -1) { fprintf(stderr, "Cannot unlink shared region by process %d, errno = %d\n", pid, errno); return errno; } } return 0; }
int main(int argc, char* argv[]) { // Initialize MPI. From this point the specified // number of processes will be executed in parallel. int mpi_status = MPI_Init(&argc, &argv); int mpi_error_msg_length; char mpi_error_msg[MPI_MAX_ERROR_STRING]; if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot initialize MPI, status = %s\n", mpi_error_msg); return 1; } // Get the size of the MPI global communicator, // that is get the total number of MPI processes. int nprocesses; mpi_status = MPI_Comm_size(MPI_COMM_WORLD, &nprocesses); if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot retrieve the number of MPI processes, status = %s\n", mpi_error_msg); return 1; } // Get the rank (index) of the current MPI process // in the global communicator. int iprocess; mpi_status = MPI_Comm_rank(MPI_COMM_WORLD, &iprocess); if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot retrieve the rank of current MPI process, status = %s\n", mpi_error_msg); return 1; } int ndevices = 0; cudaError_t cuda_status = cudaGetDeviceCount(&ndevices); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Return if no cuda devices present. if (iprocess == 0) printf("%d CUDA device(s) found\n", ndevices); if (!ndevices) return 0; // Get problem size from the command line. if (argc != 3) { printf("Usage: %s <n> <npasses>\n", argv[0]); return 0; } int n = atoi(argv[1]); int npasses = atoi(argv[2]); size_t size = n * n * sizeof(float); if ((n <= 0) || (npasses <= 0)) return 0; // Assign unique device to each MPI process. cuda_status = cudaSetDevice(iprocess); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot set CUDA device by process %d, status= %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Create two device input buffers. float *din1, *din2; cuda_status = cudaMalloc((void**)&din1, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate input device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } cuda_status = cudaMalloc((void**)&din2, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate input device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Create device output buffer. float* dout; cuda_status = cudaMalloc((void**)&dout, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate output device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } float* hin = (float*)malloc(size); float* hout = (float*)malloc(size); // Generate random input data. double dinvrmax = 1.0 / RAND_MAX; for (int i = 0; i < n * n; i++) { for (int j = 0; j < iprocess + 1; j++) hin[i] += rand() * dinvrmax; hin[i] /= iprocess + 1; } // Copy input data generated on host to device buffer. cuda_status = cudaMemcpy(din1, hin, size, cudaMemcpyHostToDevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy input data from host to device by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Perform the specified number of processing passes. for (int ipass = 0; ipass < npasses; ipass++) { // Fill output device buffer will zeros. cuda_status = cudaMemset(dout, 0, size); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot fill output device buffer with zeros by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Process data on GPU. pattern2d_gpu(1, n, 1, 1, n, 1, din1, dout); // Wait for GPU kernels to finish processing. cuda_status = cudaThreadSynchronize(); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot synchronize GPU kernel by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Copy output data back from device to host. cuda_status = cudaMemcpy(hout, dout, size, cudaMemcpyDeviceToHost); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy output data from device to host by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } // Output average value of the resulting field. float avg = 0.0; for (int i = 0; i < n * n; i++) avg += hout[i]; avg /= n * n; printf("Sending process %d resulting field with average = %f to process %d\n", iprocess, avg, (iprocess + 1) % nprocesses); MPI_Request request; int inext = (iprocess + 1) % nprocesses; int iprev = (iprocess - 1) % nprocesses; iprev += (iprev < 0) ? nprocesses : 0; // Pass entire process input device buffer directly to input device buffer // of next process. mpi_status = MPI_Isend(din1, n * n, MPI_FLOAT, inext, 0, MPI_COMM_WORLD, &request); mpi_status = MPI_Recv(din2, n * n, MPI_FLOAT, iprev, 0, MPI_COMM_WORLD, NULL); mpi_status = MPI_Wait(&request, MPI_STATUS_IGNORE); // Swap buffers. float* swap = din1; din1 = din2; din2 = swap; } cuda_status = cudaFree(din1); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot free input device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } cuda_status = cudaFree(dout); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot free output device buffer by process %d, status = %s\n", iprocess, cudaGetErrorString(cuda_status)); return 1; } free(hin); free(hout); mpi_status = MPI_Finalize(); if (mpi_status != MPI_SUCCESS) { MPI_Error_string(mpi_status, mpi_error_msg, &mpi_error_msg_length); fprintf(stderr, "Cannot finalize MPI, status = %s\n", mpi_error_msg); return 1; } return 0; }