/* Run the Gaussian-blur benchmark on the accelerator.  For each energy
   loop the host images are uploaded, the blur kernel is ping-ponged
   between the two device buffers for the requested number of
   checkpoints, and the results are copied back to the host.  */
static void exec_gaussblur_app(DATA *image_out, DATA *image_in, ARGUMENTS *settings) {
  size_t nbytes = settings->size * settings->size
                  * (PIXEL_CHANNELS * sizeof(DATA));
  DATA *d_in  = (DATA *)acc_malloc(nbytes);
  DATA *d_out = (DATA *)acc_malloc(nbytes);

  for (uint32_t loop = 0; loop < settings->energy_loops; ++loop) {
    /* Upload the current host state.  */
    acc_memcpy_to_device(d_in,  image_in,  nbytes);
    acc_memcpy_to_device(d_out, image_out, nbytes);

    /* Alternate the blur between the two device buffers.  */
    for (uint32_t pass = 0; pass < settings->checkpoints; ++pass) {
      gaussblur_calc_gpu_compute((uint16_t *)d_in,  (uint16_t *)d_out,
                                 settings->size);
      gaussblur_calc_gpu_compute((uint16_t *)d_out, (uint16_t *)d_in,
                                 settings->size);
    }

    /* Download the results for this checkpoint interval.  */
    acc_memcpy_from_device(image_in,  d_in,  nbytes);
    acc_memcpy_from_device(image_out, d_out, nbytes);
  }

  acc_free(d_in);
  acc_free(d_out);
  d_in = NULL;
  d_out = NULL;
}
/* Copy the SimBox's simulation state onto the device.  Host-side
   pointers are cached first; device mirrors are only built when
   running in parallel mode.  Each 2-D table is deep-copied row by
   row: the array of row pointers lives in device memory (acc_malloc),
   each row's payload is transferred with acc_copyin, and the
   device-side row pointer is patched in from device code so the
   device table points at device rows.  */
void GPUCopy::copyIn(SimBox *sb) {
  // Cache host pointers (aliases only -- sb retains ownership).
  h_moleculeData = sb->moleculeData;
  h_atomData = sb->atomData;
  h_atomCoordinates = sb->atomCoordinates;
  h_rollBackCoordinates = sb->rollBackCoordinates;
  h_size = sb-> size;
  h_primaryIndexes = sb->primaryIndexes;
  // Nothing to mirror on the device when running serially.
  if (!parallel) {
    return;
  }
#ifdef _OPENACC
  // moleculeData: MOL_DATA_SIZE rows of numMolecules ints each.
  d_moleculeData = (int**)acc_malloc(MOL_DATA_SIZE * sizeof(int *));
  assert(d_moleculeData != NULL);
  for (int row = 0; row < MOL_DATA_SIZE; row++) {
    int *h_moleculeData_row = sb->moleculeData[row];
    int *d_moleculeData_row = (int *)acc_copyin(h_moleculeData_row,
        sb->numMolecules * sizeof(int));
    assert(d_moleculeData_row != NULL);
    // The pragma applies to the single assignment below: store the
    // device row pointer into the device-resident pointer table.
    #pragma acc parallel deviceptr(d_moleculeData)
    d_moleculeData[row] = d_moleculeData_row;
  }
  // atomData: ATOM_DATA_SIZE rows of numAtoms Reals each.
  d_atomData = (Real**)acc_malloc(ATOM_DATA_SIZE * sizeof(Real *));
  assert(d_atomData != NULL);
  for (int row = 0; row < ATOM_DATA_SIZE; row++) {
    Real *h_atomData_row = sb->atomData[row];
    Real *d_atomData_row = (Real *)acc_copyin(h_atomData_row,
        sb->numAtoms * sizeof(Real));
    assert(d_atomData_row != NULL);
    #pragma acc parallel deviceptr(d_atomData)
    d_atomData[row] = d_atomData_row;
  }
  // atomCoordinates: NUM_DIMENSIONS rows of numAtoms Reals each.
  d_atomCoordinates = (Real**)acc_malloc(NUM_DIMENSIONS * sizeof(Real *));
  assert(d_atomCoordinates != NULL);
  for (int row = 0; row < NUM_DIMENSIONS; row++) {
    Real *h_atomCoordinates_row = sb->atomCoordinates[row];
    Real *d_atomCoordinates_row = (Real *)acc_copyin(h_atomCoordinates_row,
        sb->numAtoms * sizeof(Real));
    assert(d_atomCoordinates_row != NULL);
    #pragma acc parallel deviceptr(d_atomCoordinates)
    d_atomCoordinates[row] = d_atomCoordinates_row;
  }
  // rollBackCoordinates: NUM_DIMENSIONS rows.
  // NOTE(review): these rows are sized largestMol while the
  // atomCoordinates rows above are sized numAtoms -- looks intentional
  // (rollback only needs one molecule) but confirm against the kernels
  // that read d_rollBackCoordinates.
  d_rollBackCoordinates = (Real**)acc_malloc(NUM_DIMENSIONS * sizeof(Real *));
  assert(d_rollBackCoordinates != NULL);
  for (int row = 0; row < NUM_DIMENSIONS; row++) {
    Real *h_rollBackCoordinates_row = sb->rollBackCoordinates[row];
    Real *d_rollBackCoordinates_row = (Real *)acc_copyin(h_rollBackCoordinates_row,
        sb->largestMol * sizeof(Real));
    assert(d_rollBackCoordinates_row != NULL);
    #pragma acc parallel deviceptr(d_rollBackCoordinates)
    d_rollBackCoordinates[row] = d_rollBackCoordinates_row;
  }
  // Flat arrays: primary indexes and the box dimensions.
  d_primaryIndexes = (int *)acc_copyin(sb->primaryIndexes,
      sb->numPIdxes * sizeof(int));
  d_size = (Real *)acc_copyin(sb->size, NUM_DIMENSIONS * sizeof(Real));
#endif
}
/* Exercise acc_malloc / acc_free corner cases: a zero-byte request
   must yield NULL and a real request must succeed, both after an
   explicit acc_init and after only acc_set_device_type.  */

/* A zero-byte device allocation must return NULL.  */
static void
check_zero_byte_alloc (void)
{
  void *p = acc_malloc (0);
  if (p != NULL)
    abort ();
}

/* A 1 KiB device allocation must succeed; release it again.  */
static void
check_real_alloc (void)
{
  void *p = acc_malloc (1024);
  if (p == NULL)
    abort ();
  acc_free (p);
}

int
main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;
  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;
#endif

  /* Zero-byte allocation after an explicit init.  */
  acc_init (devtype);
  check_zero_byte_alloc ();
  acc_free (0);  /* Freeing a null device pointer must be a no-op.  */
  acc_shutdown (devtype);

  /* ...and again with only the device type set.  */
  acc_set_device_type (devtype);
  check_zero_byte_alloc ();
  acc_shutdown (devtype);

  /* Non-trivial allocation after an explicit init.  */
  acc_init (devtype);
  check_real_alloc ();
  acc_shutdown (devtype);

  /* ...and again with only the device type set.  */
  acc_set_device_type (devtype);
  check_real_alloc ();
  acc_shutdown (devtype);

  return 0;
}
int allocate_device_buffer (char ** buffer) { #ifdef _ENABLE_CUDA_ cudaError_t cuerr = cudaSuccess; #endif switch (options.accel) { #ifdef _ENABLE_CUDA_ case cuda: cuerr = cudaMalloc((void **)buffer, MYBUFSIZE); if (cudaSuccess != cuerr) { fprintf(stderr, "Could not allocate device memory\n"); return 1; } break; #endif #ifdef _ENABLE_OPENACC_ case openacc: *buffer = acc_malloc(MYBUFSIZE); if (NULL == *buffer) { fprintf(stderr, "Could not allocate device memory\n"); return 1; } break; #endif default: fprintf(stderr, "Could not allocate device memory\n"); return 1; } return 0; }
int allocate_buffer (void ** buffer, size_t size, enum accel_type type) { if (options.target == cpu || options.target == both) { allocate_host_arrays(); } size_t alignment = sysconf(_SC_PAGESIZE); #ifdef _ENABLE_CUDA_ cudaError_t cuerr = cudaSuccess; #endif switch (type) { case none: return posix_memalign(buffer, alignment, size); #ifdef _ENABLE_CUDA_ case cuda: cuerr = cudaMalloc(buffer, size); if (cudaSuccess != cuerr) { return 1; } else { return 0; } case managed: cuerr = cudaMallocManaged(buffer, size, cudaMemAttachGlobal); if (cudaSuccess != cuerr) { return 1; } else { return 0; } #endif #ifdef _ENABLE_OPENACC_ case openacc: *buffer = acc_malloc(size); if (NULL == *buffer) { return 1; } else { return 0; } #endif default: return 1; } }
/* Round-trip a 256-byte pattern through device memory, with an
   intervening oversized transfer.  */
int main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  acc_init (acc_device_nvidia);

  /* Fill the host buffer with a known pattern.  */
  h = (unsigned char *) malloc (N);
  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_malloc (N);

  acc_memcpy_to_device (d, h, N);
  memset (&h[0], 0, N);

  /* NOTE(review): this transfer is twice the size of both buffers
     (N << 1); it looks like a deliberate out-of-bounds copy that the
     OpenACC runtime is expected to reject at run time -- confirm
     against the test harness's expected output before "fixing" it.  */
  acc_memcpy_to_device (d, h, N << 1);

  acc_memcpy_from_device (h, d, N);

  /* The original pattern must survive the round trip.  */
  for (i = 0; i < N; i++)
    {
      if (h[i] != i)
        abort ();
    }

  acc_free (d);
  free (h);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
/* Mapping a host buffer with a zero-byte extent is invalid; the
   "CheCKpOInT" marker on stderr lets the harness verify that
   execution reached the offending acc_map_data call.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *host;
  void *dev;

  host = (unsigned char *) malloc (N);
  dev = acc_malloc (N);

  fprintf (stderr, "CheCKpOInT\n");
  /* Zero-length mapping: expected to trigger a runtime error.  */
  acc_map_data (host, dev, 0);

  acc_unmap_data (host);
  acc_free (dev);
  free (host);

  return 0;
}
/* Attempt a host-to-device transfer with a null device destination.  */
int main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);
  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_malloc (N);

  /* Marker for the test harness: everything up to here succeeded.  */
  fprintf (stderr, "CheCKpOInT\n");

  /* NOTE(review): the destination is a null device pointer (0), not d;
     this looks like a deliberate invalid transfer that the runtime is
     expected to reject -- confirm against the harness's expected
     output.  */
  acc_memcpy_to_device (0, h, N);

  memset (&h[0], 0, N);
  acc_memcpy_from_device (h, d, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != i)
        abort ();
    }

  acc_free (d);
  free (h);

  return 0;
}
/* acc_map_data must make the mapped host range visible to
   acc_is_present, and acc_unmap_data must cleanly undo it.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *host;
  void *dev;

  host = (unsigned char *) malloc (N);
  dev = acc_malloc (N);

  acc_map_data (host, dev, N);
  if (acc_is_present (host, N) != 1)
    abort ();
  acc_unmap_data (host);

  acc_free (dev);
  free (host);

  return 0;
}
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); 
acc_shutdown (acc_device_nvidia); return 0; }
/* Launch a CUDA "delay" kernel on a stream bound to OpenACC async
   queue 0 and verify acc_async_test_all: it must report work pending
   right after the launch and report completion after sleeping past
   the programmed delay.  */
int main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&nprocs,
      CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  /* Program a ~200 ms delay, expressed in device clock ticks.  */
  dtime = 200.0;
  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);
  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  /* Bind the stream to async queue 0 so the launch below is visible
     to the OpenACC async API.  */
  if (!acc_set_cuda_stream (0, stream))
    abort ();

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  /* Immediately after launch the kernel must still be running, so
     acc_async_test_all must report "not all complete" (0).  */
  if (acc_async_test_all () != 0)
    {
      fprintf (stderr, "asynchronous operation not running\n");
      abort ();
    }

  /* Sleep past the programmed delay (dtime is in milliseconds).  */
  sleep ((int) (dtime / 1000.f) + 1);

  /* Now everything must have drained: expect "all complete" (1).  */
  if (acc_async_test_all () != 1)
    {
      fprintf (stderr, "found asynchronous operation still running\n");
      abort ();
    }

  acc_unmap_data (a);

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
int main (int argc, char **argv) { CUdevice dev; CUfunction delay2; CUmodule module; CUresult r; int N; int i; CUstream *streams; unsigned long **a, **d_a, *tid, ticks; int nbytes; void *kargs[3]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay2, module, "delay2"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = sizeof (int); ticks = (unsigned long) (200.0 * clkrate); N = nprocs; streams = (CUstream *) malloc (N * sizeof (void *)); a = (unsigned long **) malloc (N * sizeof (unsigned long *)); d_a = (unsigned long **) malloc (N * sizeof (unsigned long *)); tid = (unsigned long *) malloc (N * sizeof (unsigned long)); for (i = 0; i < N; i++) { a[i] = (unsigned long *) malloc (sizeof (unsigned long)); *a[i] = N; d_a[i] = (unsigned long *) acc_malloc (nbytes); tid[i] = i; acc_map_data (a[i], d_a[i], nbytes); streams[i] = (CUstream) acc_get_cuda_stream (i); if (streams[i] != NULL) abort (); r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (i, streams[i])) abort (); } for (i = 0; i < N; i++) { kargs[0] = (void *) &d_a[i]; kargs[1] = (void *) &ticks; kargs[2] = (void *) &tid[i]; r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, 
streams[i], kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } ticks = (unsigned long) (50.0 * clkrate); } acc_wait_all_async (0); for (i = 0; i < N; i++) { acc_copyout (a[i], nbytes); if (*a[i] != i) abort (); } free (streams); for (i = 0; i < N; i++) { free (a[i]); } free (a); free (d_a); free (tid); acc_shutdown (acc_device_nvidia); exit (0); }