static void exec_gaussblur_app(DATA *image_out, DATA *image_in, ARGUMENTS *settings) { size_t size_in_bytes = settings->size*settings->size*(PIXEL_CHANNELS*sizeof(DATA)); DATA *image_in_gpu = (DATA *)acc_malloc(size_in_bytes); DATA *image_out_gpu = (DATA *)acc_malloc(size_in_bytes); uint32_t e; uint32_t i; for (e=0; e<settings->energy_loops; ++e) { acc_memcpy_to_device(image_in_gpu , image_in , size_in_bytes); acc_memcpy_to_device(image_out_gpu, image_out, size_in_bytes); for (i=0; i<settings->checkpoints; ++i) { gaussblur_calc_gpu_compute((uint16_t *)image_in_gpu , (uint16_t *)image_out_gpu, settings->size); gaussblur_calc_gpu_compute((uint16_t *)image_out_gpu, (uint16_t *)image_in_gpu , settings->size); } acc_memcpy_from_device(image_in , image_in_gpu , size_in_bytes); acc_memcpy_from_device(image_out, image_out_gpu, size_in_bytes); } acc_free(image_in_gpu); acc_free(image_out_gpu); image_in_gpu = NULL; image_out_gpu = NULL; }
int main (int argc, char **argv) { void *d; acc_device_t devtype = acc_device_host; #if ACC_DEVICE_TYPE_nvidia devtype = acc_device_nvidia; if (acc_get_num_devices (acc_device_nvidia) == 0) return 0; #endif acc_init (devtype); d = acc_malloc (0); if (d != NULL) abort (); acc_free (0); acc_shutdown (devtype); acc_set_device_type (devtype); d = acc_malloc (0); if (d != NULL) abort (); acc_shutdown (devtype); acc_init (devtype); d = acc_malloc (1024); if (d == NULL) abort (); acc_free (d); acc_shutdown (devtype); acc_set_device_type (devtype); d = acc_malloc (1024); if (d == NULL) abort (); acc_free (d); acc_shutdown (devtype); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_copyin (h, N); acc_free (d); fprintf (stderr, "CheCKpOInT\n"); acc_copyout (h, N); free (h); return 0; }
void free_buffer (void * buffer, enum accel_type type) { switch (type) { case none: free(buffer); break; case managed: case cuda: #ifdef _ENABLE_CUDA_ cudaFree(buffer); #endif break; case openacc: #ifdef _ENABLE_OPENACC_ acc_free(buffer); #endif break; } /* Free dummy compute related resources */ if (is_alloc) { if (options.target == cpu) { free_host_arrays(); } #ifdef _ENABLE_CUDA_KERNEL_ else if (options.target == gpu || options.target == both) { free_host_arrays(); free_device_arrays(); } #endif } is_alloc = 0; }
void free_buffer (void * buffer, enum accel_type type) { switch (type) { case none: free(buffer); break; case managed: case cuda: #ifdef _ENABLE_CUDA_ cudaFree(buffer); #endif break; case openacc: #ifdef _ENABLE_OPENACC_ acc_free(buffer); #endif break; } /* Free dummy compute related resources */ if (cpu == options.target || both == options.target) { free_host_arrays(); } if (gpu == options.target || both == options.target) { #ifdef _ENABLE_CUDA_KERNEL_ free_device_arrays(); #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */ } }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_copyin (h, N); if (acc_is_present (h, 1) != 1) abort (); if (acc_is_present (h, N + 1) != 0) abort (); if (acc_is_present (h + 1, N) != 0) abort (); if (acc_is_present (h - 1, N) != 0) abort (); if (acc_is_present (h - 1, N - 1) != 0) abort (); if (acc_is_present (h + N, 0) != 0) abort (); if (acc_is_present (h + N, N) != 0) abort (); if (acc_is_present (0, N) != 0) abort (); if (acc_is_present (h, 0) != 0) abort (); acc_free (d); if (acc_is_present (h, 1) != 0) abort (); free (h); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; acc_init (acc_device_nvidia); h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_malloc (N); acc_memcpy_to_device (d, h, N); memset (&h[0], 0, N); acc_memcpy_to_device (d, h, N << 1); acc_memcpy_from_device (h, d, N); for (i = 0; i < N; i++) { if (h[i] != i) abort (); } acc_free (d); free (h); acc_shutdown (acc_device_nvidia); return 0; }
int search_GPU(int s) { int i; int find = -1; a = (float *) malloc(sizeof(float) * SIZE); c = (float)s-5; init(s); double start, finish, elapsed; start = (double) clock() / (CLOCKS_PER_SEC*1000); #pragma acc data copyin(a[0:s],c) copy(find) { #pragma acc kernels { #pragma acc loop independent { for (i = 0; i < s; ++i) { if(a[i] == c) { find = i; i=s; } } } } } acc_free(acc_deviceptr(a)); finish = (double) clock() / (CLOCKS_PER_SEC*1000); elapsed = finish - start; fprintf(out,"%.6lf,",elapsed); //print_result(s,find); free(a); return find; }
int free_device_buffer (void * buf) { switch (options.accel) { #ifdef _ENABLE_CUDA_ case cuda: cudaFree(buf); break; #endif #ifdef _ENABLE_OPENACC_ case openacc: acc_free(buf); break; #endif default: /* unknown device */ return 1; } return 0; }
int main (int argc, char **argv) { const int N = 256; unsigned char *h; void *d; h = (unsigned char *) malloc (N); d = acc_malloc (N); fprintf (stderr, "CheCKpOInT\n"); acc_map_data (h, d, 0); acc_unmap_data (h); acc_free (d); free (h); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_malloc (N); fprintf (stderr, "CheCKpOInT\n"); acc_memcpy_to_device (0, h, N); memset (&h[0], 0, N); acc_memcpy_from_device (h, d, N); for (i = 0; i < N; i++) { if (h[i] != i) abort (); } acc_free (d); free (h); return 0; }
int main (int argc, char **argv) { const int N = 256; unsigned char *h; void *d; h = (unsigned char *) malloc (N); d = acc_malloc (N); acc_map_data (h, d, N); if (acc_is_present (h, N) != 1) abort (); acc_unmap_data (h); acc_free (d); free (h); return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } if (acc_async_test_all () != 0) { fprintf (stderr, "asynchronous operation not running\n"); abort (); } sleep ((int) (dtime / 1000.f) + 1); if (acc_async_test_all () != 1) { fprintf (stderr, "found asynchronous operation still running\n"); abort (); } acc_unmap_data (a); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); exit (0); }
int main (int argc, char **argv) { cublasStatus_t s; cublasHandle_t h; CUcontext pctx; CUresult r; int i; const int N = 256; float *h_X, *h_Y1, *h_Y2; float *d_X,*d_Y; float alpha = 2.0f; float error_norm; float ref_norm; /* Test 4 - OpenACC creates, cuBLAS shares. */ acc_set_device_num (0, acc_device_nvidia); r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } h_X = (float *) malloc (N * sizeof (float)); if (h_X == 0) { fprintf (stderr, "malloc failed: for h_X\n"); exit (EXIT_FAILURE); } h_Y1 = (float *) malloc (N * sizeof (float)); if (h_Y1 == 0) { fprintf (stderr, "malloc failed: for h_Y1\n"); exit (EXIT_FAILURE); } h_Y2 = (float *) malloc (N * sizeof (float)); if (h_Y2 == 0) { fprintf (stderr, "malloc failed: for h_Y2\n"); exit (EXIT_FAILURE); } for (i = 0; i < N; i++) { h_X[i] = rand () / (float) RAND_MAX; h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX; } #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha) { int i; for (i = 0; i < N; i++) { h_Y2[i] = alpha * h_X[i] + h_Y2[i]; } } r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float)); if (d_X == NULL) { fprintf (stderr, "copyin error h_Y1\n"); exit (EXIT_FAILURE); } d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float)); if (d_Y == NULL) { fprintf (stderr, "copyin error h_Y1\n"); exit (EXIT_FAILURE); } s = cublasCreate (&h); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasCreate failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasSaxpy failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float)); context_check (pctx); error_norm = 0; ref_norm = 0; for (i = 0; i < N; ++i) { float diff; diff = h_Y1[i] - h_Y2[i]; error_norm += diff * diff; ref_norm += h_Y2[i] * h_Y2[i]; } error_norm = (float) sqrt ((double) error_norm); ref_norm = (float) sqrt ((double) ref_norm); if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f)) { fprintf (stderr, "math error\n"); exit (EXIT_FAILURE); } free (h_X); free (h_Y1); free (h_Y2); acc_free (d_X); acc_free (d_Y); context_check (pctx); s = cublasDestroy (h); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasDestroy failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); acc_shutdown (acc_device_nvidia); r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } if (pctx) { fprintf (stderr, "Unexpected context\n"); exit (EXIT_FAILURE); } return EXIT_SUCCESS; }