void GPUCopy::copyOut(SimBox* sb) { if (!parallel) return; #ifdef _OPENACC for (int row = 0; row < MOL_DATA_SIZE; row++) { int *h_moleculeData_row = h_moleculeData[row]; acc_copyout(h_moleculeData_row, sb->numMolecules * sizeof(int)); } for (int row = 0; row < ATOM_DATA_SIZE; row++) { Real *h_atomData_row = h_atomData[row]; acc_copyout(h_atomData_row, sb->numAtoms * sizeof(Real)); } for (int row = 0; row < NUM_DIMENSIONS; row++) { Real *h_atomCoordinates_row = h_atomCoordinates[row]; acc_copyout(h_atomCoordinates_row, sb->numAtoms * sizeof(Real)); } for (int row = 0; row < NUM_DIMENSIONS; row++) { Real *h_rollBackCoordinates_row = h_rollBackCoordinates[row]; acc_copyout(h_rollBackCoordinates_row, sb->largestMol * sizeof(Real)); } acc_copyout(h_primaryIndexes, sb->numPIdxes); acc_copyout(h_size, NUM_DIMENSIONS); acc_copyout(h_angleSizes, sb->numAngles); acc_copyout(h_rollBackAngleSizes, sb->numAngles); acc_copyout(h_bondLengths, sb->numBonds); acc_copyout(h_rollBackBondLengths, sb->numBonds); #endif }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } (void) acc_copyin (h, N); acc_copyout (h, N); fprintf (stderr, "CheCKpOInT\n"); acc_copyout (h, N); free (h); return 0; }
static void * test (void *arg) { int i; if (acc_get_current_cuda_context () != NULL) abort (); if (acc_is_present (x, N) != 1) abort (); memset (x, 0, N); acc_copyout (x, N); for (i = 0; i < N; i++) { if (x[i] != i) abort (); x[i] = N - i - 1; } d_x = acc_copyin (x, N); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h1, *h2; h1 = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h1[i] = 0xab; } (void) acc_copyin (h1, N); h2 = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h2[i] = 0xde; } (void) acc_copyin (h2, N); acc_copyout (h1, N + N); free (h1); free (h2); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_copyin (h, N); acc_free (d); acc_copyout (h, N); free (h); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } fprintf (stderr, "CheCKpOInT\n"); acc_update_device (h, N); acc_copyout (h, N); for (i = 0; i < N; i++) { if (h[i] != 0xab) abort (); } free (h); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_present_or_copyin (h, 0); if (!d) abort (); memset (&h[0], 0, N); acc_copyout (h, N); for (i = 0; i < N; i++) { if (h[i] != i) abort (); } free (h); return 0; }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_copyin (h, N); if (!d) abort (); for (i = 0; i < N; i++) { h[i] = 0xab; } acc_update_device (h, N); acc_copyout (h, N); for (i = 0; i < N; i++) { if (h[i] != 0xab) abort (); } free (h); return 0; }
void GOACC_enter_exit_data (int device, const void *offload_table, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned short *kinds, int async, int num_waits, ...) { struct goacc_thread *thr; struct gomp_device_descr *acc_dev; bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; bool data_enter = false; size_t i; select_acc_device (device); thr = goacc_thread (); acc_dev = thr->dev; if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) || host_fallback) return; if (num_waits > 0) { va_list ap; va_start (ap, num_waits); goacc_wait (async, num_waits, ap); va_end (ap); } acc_dev->openacc.async_set_async_func (async); /* Determine if this is an "acc enter data". */ for (i = 0; i < mapnum; ++i) { unsigned char kind = kinds[i] & 0xff; if (kind == GOMP_MAP_POINTER || kind == GOMP_MAP_TO_PSET) continue; if (kind == GOMP_MAP_FORCE_ALLOC || kind == GOMP_MAP_FORCE_PRESENT || kind == GOMP_MAP_FORCE_TO) { data_enter = true; break; } if (kind == GOMP_MAP_FORCE_DEALLOC || kind == GOMP_MAP_FORCE_FROM) break; gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", kind); } if (data_enter) { for (i = 0; i < mapnum; i++) { unsigned char kind = kinds[i] & 0xff; /* Scan for PSETs. */ int psets = find_pset (i, mapnum, kinds); if (!psets) { switch (kind) { case GOMP_MAP_POINTER: gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i], &kinds[i]); break; case GOMP_MAP_FORCE_ALLOC: acc_create (hostaddrs[i], sizes[i]); break; case GOMP_MAP_FORCE_PRESENT: acc_present_or_copyin (hostaddrs[i], sizes[i]); break; case GOMP_MAP_FORCE_TO: acc_present_or_copyin (hostaddrs[i], sizes[i]); break; default: gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", kind); break; } } else { gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]); /* Increment 'i' by two because OpenACC requires fortran arrays to be contiguous, so each PSET is associated with one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and one MAP_POINTER. */ i += 2; } } } else for (i = 0; i < mapnum; ++i) { unsigned char kind = kinds[i] & 0xff; int psets = find_pset (i, mapnum, kinds); if (!psets) { switch (kind) { case GOMP_MAP_POINTER: gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) == GOMP_MAP_FORCE_FROM, async, 1); break; case GOMP_MAP_FORCE_DEALLOC: acc_delete (hostaddrs[i], sizes[i]); break; case GOMP_MAP_FORCE_FROM: acc_copyout (hostaddrs[i], sizes[i]); break; default: gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", kind); break; } } else { gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) == GOMP_MAP_FORCE_FROM, async, 3); /* See the above comment. */ i += 2; } } acc_dev->openacc.async_set_async_func (acc_async_sync); }
int main () { int *p = (int *)malloc (sizeof (int)); /* Test 1: pragma input, library output. */ #pragma acc enter data copyin (p[0:1]) #pragma acc parallel present (p[0:1]) num_gangs (1) { p[0] = 1; } acc_copyout (p, sizeof (int)); assert (p[0] == 1); /* Test 2: library input, pragma output. */ acc_copyin (p, sizeof (int)); #pragma acc parallel present (p[0:1]) num_gangs (1) { p[0] = 2; } #pragma acc exit data copyout (p[0:1]) assert (p[0] == 2); /* Test 3: library input, library output. */ acc_copyin (p, sizeof (int)); #pragma acc parallel present (p[0:1]) num_gangs (1) { p[0] = 3; } acc_copyout (p, sizeof (int)); assert (p[0] == 3); /* Test 4: pragma input, pragma output. */ #pragma acc enter data copyin (p[0:1]) #pragma acc parallel present (p[0:1]) num_gangs (1) { p[0] = 3; } #pragma acc exit data copyout (p[0:1]) assert (p[0] == 3); free (p); return 0; }
int main (int argc, char **argv) { const int nthreads = 1; int i; pthread_attr_t attr; pthread_t *tid; if (acc_get_num_devices (acc_device_nvidia) == 0) return 0; acc_init (acc_device_nvidia); x = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { x[i] = i; } d_x = acc_copyin (x, N); if (acc_is_present (x, N) != 1) abort (); if (pthread_attr_init (&attr) != 0) perror ("pthread_attr_init failed"); tid = (pthread_t *) malloc (nthreads * sizeof (pthread_t)); for (i = 0; i < nthreads; i++) { if (pthread_create (&tid[i], &attr, &test, (void *) (unsigned long) (i)) != 0) perror ("pthread_create failed"); } if (pthread_attr_destroy (&attr) != 0) perror ("pthread_attr_destroy failed"); for (i = 0; i < nthreads; i++) { void *res; if (pthread_join (tid[i], &res) != 0) perror ("pthread join failed"); } if (acc_is_present (x, N) != 1) abort (); memset (x, 0, N); acc_copyout (x, N); for (i = 0; i < N; i++) { if (x[i] != N - i - 1) abort (); } if (acc_is_present (x, N) != 0) abort (); acc_shutdown (acc_device_nvidia); return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay2; CUmodule module; CUresult r; int N; int i; CUstream *streams; unsigned long **a, **d_a, *tid, ticks; int nbytes; void *kargs[3]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay2, module, "delay2"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = sizeof (int); ticks = (unsigned long) (200.0 * clkrate); N = nprocs; streams = (CUstream *) malloc (N * sizeof (void *)); a = (unsigned long **) malloc (N * sizeof (unsigned long *)); d_a = (unsigned long **) malloc (N * sizeof (unsigned long *)); tid = (unsigned long *) malloc (N * sizeof (unsigned long)); for (i = 0; i < N; i++) { a[i] = (unsigned long *) malloc (sizeof (unsigned long)); *a[i] = N; d_a[i] = (unsigned long *) acc_malloc (nbytes); tid[i] = i; acc_map_data (a[i], d_a[i], nbytes); streams[i] = (CUstream) acc_get_cuda_stream (i); if (streams[i] != NULL) abort (); r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (i, streams[i])) abort (); } for (i = 0; i < N; i++) { kargs[0] = (void *) &d_a[i]; kargs[1] = (void *) &ticks; kargs[2] = (void *) &tid[i]; r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } ticks = (unsigned long) (50.0 * clkrate); } acc_wait_all_async (0); for (i = 0; i < N; i++) { acc_copyout (a[i], nbytes); if (*a[i] != i) abort (); } free (streams); for (i = 0; i < N; i++) { free (a[i]); } free (a); free (d_a); free (tid); acc_shutdown (acc_device_nvidia); exit (0); }