/* Maps two independently allocated host buffers with acc_copyin and then
   calls acc_copyout on the first one with a length of N + N, i.e. twice
   the length it was mapped with.
   NOTE(review): the oversized copy-out looks like the behaviour under
   test -- confirm before "fixing" the length.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h1, *h2;
  int k;

  /* First buffer: N bytes of 0xab, mapped to the device.  */
  h1 = (unsigned char *) malloc (N);
  k = 0;
  while (k < N)
    {
      h1[k] = 0xab;
      k++;
    }
  (void) acc_copyin (h1, N);

  /* Second buffer: N bytes of 0xde, mapped to the device.  */
  h2 = (unsigned char *) malloc (N);
  k = 0;
  while (k < N)
    {
      h2[k] = 0xde;
      k++;
    }
  (void) acc_copyin (h2, N);

  /* Length here is N + N although h1 was mapped with length N.  */
  acc_copyout (h1, N + N);

  free (h1);
  free (h2);

  return 0;
}
/* Maps a host buffer with acc_copyin, releases the returned device
   pointer with acc_free, prints a checkpoint marker on stderr and only
   then calls acc_copyout on the host buffer.
   NOTE(review): the free-before-copyout order looks like the behaviour
   under test -- confirm before reordering.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h = (unsigned char *) malloc (N);
  void *d;
  int idx;

  /* h[idx] = idx for the whole buffer.  */
  for (idx = 0; idx != N; ++idx)
    h[idx] = idx;

  d = acc_copyin (h, N);

  /* Device memory is freed while the host buffer has not been copied
     out yet.  */
  acc_free (d);

  /* Marker separating output produced before and after this point.  */
  fprintf (stderr, "CheCKpOInT\n");

  acc_copyout (h, N);

  free (h);

  return 0;
}
/* Thread entry point.  Aborts unless acc_get_current_cuda_context ()
   returns NULL and the shared buffer x is reported present.  It then
   clears the host copy, copies the data back with acc_copyout, verifies
   that x[i] == i, rewrites the buffer as N - i - 1 and maps it again,
   storing the device pointer in d_x.  ARG is not used.  Always returns
   a null pointer.  */
static void *
test (void *arg)
{
  int pos;

  /* Either failed precondition aborts; the second query is only made
     when the first one passed.  */
  if (acc_get_current_cuda_context () != NULL
      || acc_is_present (x, N) != 1)
    abort ();

  /* Wipe the host copy so the values checked below can only come from
     the copy-out.  */
  memset (x, 0, N);
  acc_copyout (x, N);

  pos = 0;
  while (pos < N)
    {
      if (x[pos] != pos)
        abort ();

      x[pos] = N - pos - 1;
      pos++;
    }

  d_x = acc_copyin (x, N);

  return 0;
}
/* Maps a host buffer, clears the host copy and then calls
   acc_update_self with a null host address before checking that the
   buffer holds its original contents again.
   NOTE(review): the null address passed to acc_update_self looks like
   the behaviour under test -- confirm before changing it.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h = (unsigned char *) malloc (N);
  void *d;
  int idx;

  for (idx = 0; idx != N; ++idx)
    h[idx] = idx;

  d = acc_copyin (h, N);
  if (d == 0)
    abort ();

  /* Zero the host side; only a device-to-host update could restore the
     original values.  */
  memset (h, 0, N);

  /* Host address argument is 0 here, not h.  */
  acc_update_self (0, N);

  for (idx = 0; idx != N; ++idx)
    if (h[idx] != idx)
      abort ();

  acc_delete (h, N);

  free (h);

  return 0;
}
/* Opens an OpenACC data region with present_or_copy (i) whose body is a
   single acc_copyin call on the same variable, then returns 0.
   argc and argv are not used.  */
int main (int argc, char *argv[]) {
  int i;
  /* The data construct applies to the one statement that follows it.  */
#pragma acc data present_or_copy (i)
  acc_copyin (&i, sizeof i);
  return 0;
}
/* Combines a device pointer obtained from the OpenACC library with a
   kernels construct that names the same parameter in a data clause.

   a: pointer parameter; its own address (&a) is what is handed to
      acc_copyin, and a[0:n] appears in the pcopy clause.
   n: used both as the byte count for acc_copyin and as the section
      length in pcopy(a[0:n]).  */
void foo (int *a, size_t n) {
  /* Note that &a (the address of the parameter) is passed, not a.  */
  int *p = (int *)acc_copyin (&a, n);
  /* p is declared a device pointer; a[0:n] is listed in pcopy.  */
#pragma acc kernels deviceptr (p) pcopy(a[0:n])
  {
    a = 0;
    *p = 1;
  }
}
/* Exercises acc_is_present around the edges of a single N-byte mapping:
   one query that must report "present", a series of ranges that must
   all report "not present", and a final query after the device memory
   has been released with acc_free.  Any unexpected answer aborts.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h;
  void *d;
  int n;

  h = (unsigned char *) malloc (N);
  for (n = 0; n < N; ++n)
    h[n] = n;

  d = acc_copyin (h, N);

  /* The first byte of the mapped buffer has to be present.  */
  if (acc_is_present (h, 1) != 1)
    abort ();

  /* Each of the following ranges has to be reported as not present.
     The queries run in the listed order and stop at the first one that
     gives a different answer.  */
  if (acc_is_present (h, N + 1) != 0
      || acc_is_present (h + 1, N) != 0
      || acc_is_present (h - 1, N) != 0
      || acc_is_present (h - 1, N - 1) != 0
      || acc_is_present (h + N, 0) != 0
      || acc_is_present (h + N, N) != 0
      || acc_is_present (0, N) != 0
      || acc_is_present (h, 0) != 0)
    abort ();

  acc_free (d);

  /* After acc_free the buffer must no longer be reported present.  */
  if (acc_is_present (h, 1) != 0)
    abort ();

  free (h);

  return 0;
}
/* Maps a local array of N ints with acc_copyin and then enters a kernels
   construct that lists the returned pointer in deviceptr and the array
   itself in pcopy.  Inside the region the first element is written once
   through the array name and once through the device pointer.  */
void foo (void) {
  int a[N];
  /* The whole array (sizeof (a) bytes) is mapped; p receives the value
     returned by acc_copyin.  */
  int *p = (int *)acc_copyin (&a[0], sizeof (a));
#pragma acc kernels deviceptr (p) pcopy(a)
  {
    a[0] = 0;
    *p = 1;
  }
}
/* Fills an N-byte host buffer and calls acc_copyin on it twice in a row
   with identical arguments, discarding both results.
   NOTE(review): the repeated copy-in looks like the behaviour under
   test -- confirm before removing the second call.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h = (unsigned char *) malloc (N);
  int idx = 0;

  while (idx < N)
    {
      h[idx] = idx;
      ++idx;
    }

  /* Same host range mapped two times.  */
  (void) acc_copyin (h, N);
  (void) acc_copyin (h, N);

  free (h);

  return 0;
}
void GPUCopy::copyIn(SimBox *sb) { h_moleculeData = sb->moleculeData; h_atomData = sb->atomData; h_atomCoordinates = sb->atomCoordinates; h_rollBackCoordinates = sb->rollBackCoordinates; h_size = sb-> size; h_primaryIndexes = sb->primaryIndexes; if (!parallel) { return; } #ifdef _OPENACC d_moleculeData = (int**)acc_malloc(MOL_DATA_SIZE * sizeof(int *)); assert(d_moleculeData != NULL); for (int row = 0; row < MOL_DATA_SIZE; row++) { int *h_moleculeData_row = sb->moleculeData[row]; int *d_moleculeData_row = (int *)acc_copyin(h_moleculeData_row, sb->numMolecules * sizeof(int)); assert(d_moleculeData_row != NULL); #pragma acc parallel deviceptr(d_moleculeData) d_moleculeData[row] = d_moleculeData_row; } d_atomData = (Real**)acc_malloc(ATOM_DATA_SIZE * sizeof(Real *)); assert(d_atomData != NULL); for (int row = 0; row < ATOM_DATA_SIZE; row++) { Real *h_atomData_row = sb->atomData[row]; Real *d_atomData_row = (Real *)acc_copyin(h_atomData_row, sb->numAtoms * sizeof(Real)); assert(d_atomData_row != NULL); #pragma acc parallel deviceptr(d_atomData) d_atomData[row] = d_atomData_row; } d_atomCoordinates = (Real**)acc_malloc(NUM_DIMENSIONS * sizeof(Real *)); assert(d_atomCoordinates != NULL); for (int row = 0; row < NUM_DIMENSIONS; row++) { Real *h_atomCoordinates_row = sb->atomCoordinates[row]; Real *d_atomCoordinates_row = (Real *)acc_copyin(h_atomCoordinates_row, sb->numAtoms * sizeof(Real)); assert(d_atomCoordinates_row != NULL); #pragma acc parallel deviceptr(d_atomCoordinates) d_atomCoordinates[row] = d_atomCoordinates_row; } d_rollBackCoordinates = (Real**)acc_malloc(NUM_DIMENSIONS * sizeof(Real *)); assert(d_rollBackCoordinates != NULL); for (int row = 0; row < NUM_DIMENSIONS; row++) { Real *h_rollBackCoordinates_row = sb->rollBackCoordinates[row]; Real *d_rollBackCoordinates_row = (Real *)acc_copyin(h_rollBackCoordinates_row, sb->largestMol * sizeof(Real)); assert(d_rollBackCoordinates_row != NULL); #pragma acc parallel deviceptr(d_rollBackCoordinates) 
d_rollBackCoordinates[row] = d_rollBackCoordinates_row; } d_primaryIndexes = (int *)acc_copyin(sb->primaryIndexes, sb->numPIdxes * sizeof(int)); d_size = (Real *)acc_copyin(sb->size, NUM_DIMENSIONS * sizeof(Real)); #endif }
/* Maps a host buffer, overwrites the host copy with 0xab, prints a
   checkpoint marker and then calls acc_update_device with a null host
   address before copying the buffer back out and checking for 0xab.
   NOTE(review): the null address passed to acc_update_device looks like
   the behaviour under test -- confirm before changing it.  */
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h = (unsigned char *) malloc (N);
  void *d;
  int idx;

  for (idx = 0; idx != N; ++idx)
    h[idx] = idx;

  d = acc_copyin (h, N);
  if (d == 0)
    abort ();

  /* New host contents that the device copy does not have yet.  */
  for (idx = 0; idx != N; ++idx)
    h[idx] = 0xab;

  /* Marker separating output produced before and after this point.  */
  fprintf (stderr, "CheCKpOInT\n");

  /* Host address argument is 0 here, not h.  */
  acc_update_device (0, N);

  acc_copyout (h, N);

  for (idx = 0; idx != N; ++idx)
    if (h[idx] != 0xab)
      abort ();

  free (h);

  return 0;
}
int main (int argc, char **argv) { cublasStatus_t s; cublasHandle_t h; CUcontext pctx; CUresult r; int i; const int N = 256; float *h_X, *h_Y1, *h_Y2; float *d_X,*d_Y; float alpha = 2.0f; float error_norm; float ref_norm; /* Test 4 - OpenACC creates, cuBLAS shares. */ acc_set_device_num (0, acc_device_nvidia); r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } h_X = (float *) malloc (N * sizeof (float)); if (h_X == 0) { fprintf (stderr, "malloc failed: for h_X\n"); exit (EXIT_FAILURE); } h_Y1 = (float *) malloc (N * sizeof (float)); if (h_Y1 == 0) { fprintf (stderr, "malloc failed: for h_Y1\n"); exit (EXIT_FAILURE); } h_Y2 = (float *) malloc (N * sizeof (float)); if (h_Y2 == 0) { fprintf (stderr, "malloc failed: for h_Y2\n"); exit (EXIT_FAILURE); } for (i = 0; i < N; i++) { h_X[i] = rand () / (float) RAND_MAX; h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX; } #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha) { int i; for (i = 0; i < N; i++) { h_Y2[i] = alpha * h_X[i] + h_Y2[i]; } } r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float)); if (d_X == NULL) { fprintf (stderr, "copyin error h_Y1\n"); exit (EXIT_FAILURE); } d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float)); if (d_Y == NULL) { fprintf (stderr, "copyin error h_Y1\n"); exit (EXIT_FAILURE); } s = cublasCreate (&h); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasCreate failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasSaxpy failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float)); context_check (pctx); error_norm = 0; ref_norm = 0; for (i = 0; i < N; ++i) { float diff; diff = 
h_Y1[i] - h_Y2[i]; error_norm += diff * diff; ref_norm += h_Y2[i] * h_Y2[i]; } error_norm = (float) sqrt ((double) error_norm); ref_norm = (float) sqrt ((double) ref_norm); if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f)) { fprintf (stderr, "math error\n"); exit (EXIT_FAILURE); } free (h_X); free (h_Y1); free (h_Y2); acc_free (d_X); acc_free (d_Y); context_check (pctx); s = cublasDestroy (h); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasDestroy failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); acc_shutdown (acc_device_nvidia); r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } if (pctx) { fprintf (stderr, "Unexpected context\n"); exit (EXIT_FAILURE); } return EXIT_SUCCESS; }
/* Checks that data mapped through OpenACC directives and data mapped
   through the OpenACC library routines can be released by either
   mechanism.  A single int is mapped, written on the device by a
   one-gang parallel region and copied back; the host value is asserted
   after every round.

   Every round stores a value that differs from what the host copy held
   beforehand, so a copy-out that silently does nothing is detected.  */
int
main ()
{
  int *p = (int *)malloc (sizeof (int));

  assert (p != NULL);

  /* Give the host copy a defined value before it is first copied in.  */
  p[0] = 0;

  /* Test 1: pragma input, library output.  */

#pragma acc enter data copyin (p[0:1])

#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 1;
  }

  acc_copyout (p, sizeof (int));

  assert (p[0] == 1);

  /* Test 2: library input, pragma output.  */

  acc_copyin (p, sizeof (int));

#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 2;
  }

#pragma acc exit data copyout (p[0:1])

  assert (p[0] == 2);

  /* Test 3: library input, library output.  */

  acc_copyin (p, sizeof (int));

#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 3;
  }

  acc_copyout (p, sizeof (int));

  assert (p[0] == 3);

  /* Test 4: pragma input, pragma output.  */

#pragma acc enter data copyin (p[0:1])

  /* Uses 4 rather than repeating 3: the host already holds 3 from
     Test 3, so reusing it would let a missing copy-out pass.  */
#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 4;
  }

#pragma acc exit data copyout (p[0:1])

  assert (p[0] == 4);

  free (p);

  return 0;
}
int main (int argc, char **argv) { const int nthreads = 1; int i; pthread_attr_t attr; pthread_t *tid; if (acc_get_num_devices (acc_device_nvidia) == 0) return 0; acc_init (acc_device_nvidia); x = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { x[i] = i; } d_x = acc_copyin (x, N); if (acc_is_present (x, N) != 1) abort (); if (pthread_attr_init (&attr) != 0) perror ("pthread_attr_init failed"); tid = (pthread_t *) malloc (nthreads * sizeof (pthread_t)); for (i = 0; i < nthreads; i++) { if (pthread_create (&tid[i], &attr, &test, (void *) (unsigned long) (i)) != 0) perror ("pthread_create failed"); } if (pthread_attr_destroy (&attr) != 0) perror ("pthread_attr_destroy failed"); for (i = 0; i < nthreads; i++) { void *res; if (pthread_join (tid[i], &res) != 0) perror ("pthread join failed"); } if (acc_is_present (x, N) != 1) abort (); memset (x, 0, N); acc_copyout (x, N); for (i = 0; i < N; i++) { if (x[i] != N - i - 1) abort (); } if (acc_is_present (x, N) != 0) abort (); acc_shutdown (acc_device_nvidia); return 0; }