Пример #1
0
void GPUCopy::copyOut(SimBox* sb) {
    if (!parallel) return;

#ifdef _OPENACC
    for (int row = 0; row < MOL_DATA_SIZE; row++) {
        int *h_moleculeData_row = h_moleculeData[row];
        acc_copyout(h_moleculeData_row, sb->numMolecules * sizeof(int));
    }

    for (int row = 0; row < ATOM_DATA_SIZE; row++) {
        Real *h_atomData_row = h_atomData[row];
        acc_copyout(h_atomData_row, sb->numAtoms * sizeof(Real));
    }

    for (int row = 0; row < NUM_DIMENSIONS; row++) {
        Real *h_atomCoordinates_row = h_atomCoordinates[row];
        acc_copyout(h_atomCoordinates_row, sb->numAtoms * sizeof(Real));
    }

    for (int row = 0; row < NUM_DIMENSIONS; row++) {
        Real *h_rollBackCoordinates_row = h_rollBackCoordinates[row];
        acc_copyout(h_rollBackCoordinates_row, sb->largestMol * sizeof(Real));
    }

    acc_copyout(h_primaryIndexes, sb->numPIdxes);
    acc_copyout(h_size, NUM_DIMENSIONS);

    acc_copyout(h_angleSizes, sb->numAngles);
    acc_copyout(h_rollBackAngleSizes, sb->numAngles);
    acc_copyout(h_bondLengths, sb->numBonds);
    acc_copyout(h_rollBackBondLengths, sb->numBonds);
#endif
}
Пример #2
0
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;

  h = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  (void) acc_copyin (h, N);

  acc_copyout (h, N);

  fprintf (stderr, "CheCKpOInT\n");
  acc_copyout (h, N);

  free (h);

  return 0;
}
Пример #3
0
static void *
test (void *arg)
{
  int i;

  if (acc_get_current_cuda_context () != NULL)
    abort ();

  if (acc_is_present (x, N) != 1)
    abort ();

  memset (x, 0, N);

  acc_copyout (x, N);

  for (i = 0; i < N; i++)
    {
      if (x[i] != i)
	abort ();

      x[i] = N - i - 1;
    }

  d_x = acc_copyin (x, N);

  return 0;
}
Пример #4
0
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h1, *h2;

  h1 = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h1[i] = 0xab;
    }

  (void) acc_copyin (h1, N);

  h2 = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h2[i] = 0xde;
    }

  (void) acc_copyin (h2, N);

  acc_copyout (h1, N + N);

  free (h1);
  free (h2);

  return 0;
}
Пример #5
0
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_copyin (h, N);

  acc_free (d);

  acc_copyout (h, N);

  free (h);

  return 0;
}
Пример #6
0
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;

  h = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  fprintf (stderr, "CheCKpOInT\n");
  acc_update_device (h, N);

  acc_copyout (h, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != 0xab)
	abort ();
    }

  free (h);

  return 0;
}
Пример #7
0
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_present_or_copyin (h, 0);
  if (!d)
    abort ();

  memset (&h[0], 0, N);

  acc_copyout (h, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != i)
	abort ();
    }

  free (h);

  return 0;
}
Пример #8
0
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_copyin (h, N);
  if (!d)
    abort ();

  for (i = 0; i < N; i++)
    {
      h[i] = 0xab;
    }

  acc_update_device (h, N);

  acc_copyout (h, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != 0xab)
	abort ();
    }

  free (h);

  return 0;
}
Пример #9
0
void
GOACC_enter_exit_data (int device, const void *offload_table, size_t mapnum,
		       void **hostaddrs, size_t *sizes, unsigned short *kinds,
		       int async, int num_waits, ...)
{
  struct goacc_thread *thr;
  struct gomp_device_descr *acc_dev;
  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
  bool data_enter = false;
  size_t i;

  select_acc_device (device);

  thr = goacc_thread ();
  acc_dev = thr->dev;

  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
      || host_fallback)
    return;

  if (num_waits > 0)
    {
      va_list ap;

      va_start (ap, num_waits);

      goacc_wait (async, num_waits, ap);

      va_end (ap);
    }

  acc_dev->openacc.async_set_async_func (async);

  /* Determine if this is an "acc enter data".  */
  for (i = 0; i < mapnum; ++i)
    {
      unsigned char kind = kinds[i] & 0xff;

      if (kind == GOMP_MAP_POINTER || kind == GOMP_MAP_TO_PSET)
	continue;

      if (kind == GOMP_MAP_FORCE_ALLOC
	  || kind == GOMP_MAP_FORCE_PRESENT
	  || kind == GOMP_MAP_FORCE_TO)
	{
	  data_enter = true;
	  break;
	}

      if (kind == GOMP_MAP_FORCE_DEALLOC
	  || kind == GOMP_MAP_FORCE_FROM)
	break;

      gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
		      kind);
    }

  if (data_enter)
    {
      for (i = 0; i < mapnum; i++)
	{
	  unsigned char kind = kinds[i] & 0xff;

	  /* Scan for PSETs.  */
	  int psets = find_pset (i, mapnum, kinds);

	  if (!psets)
	    {
	      switch (kind)
		{
		case GOMP_MAP_POINTER:
		  gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i],
					&kinds[i]);
		  break;
		case GOMP_MAP_FORCE_ALLOC:
		  acc_create (hostaddrs[i], sizes[i]);
		  break;
		case GOMP_MAP_FORCE_PRESENT:
		  acc_present_or_copyin (hostaddrs[i], sizes[i]);
		  break;
		case GOMP_MAP_FORCE_TO:
		  acc_present_or_copyin (hostaddrs[i], sizes[i]);
		  break;
		default:
		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
			      kind);
		  break;
		}
	    }
	  else
	    {
	      gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]);
	      /* Increment 'i' by two because OpenACC requires fortran
		 arrays to be contiguous, so each PSET is associated with
		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
		 one MAP_POINTER.  */
	      i += 2;
	    }
	}
    }
  else
    for (i = 0; i < mapnum; ++i)
      {
	unsigned char kind = kinds[i] & 0xff;

	int psets = find_pset (i, mapnum, kinds);

	if (!psets)
	  {
	    switch (kind)
	      {
	      case GOMP_MAP_POINTER:
		gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff)
					 == GOMP_MAP_FORCE_FROM,
					 async, 1);
		break;
	      case GOMP_MAP_FORCE_DEALLOC:
		acc_delete (hostaddrs[i], sizes[i]);
		break;
	      case GOMP_MAP_FORCE_FROM:
		acc_copyout (hostaddrs[i], sizes[i]);
		break;
	      default:
		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
			    kind);
		break;
	      }
	  }
	else
	  {
	    gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff)
				     == GOMP_MAP_FORCE_FROM, async, 3);
	    /* See the above comment.  */
	    i += 2;
	  }
      }

  acc_dev->openacc.async_set_async_func (acc_async_sync);
}
Пример #10
0
int
main ()
{
  int *p = (int *)malloc (sizeof (int));

  /* Test 1: pragma input, library output.  */
  
#pragma acc enter data copyin (p[0:1])

#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 1;
  }

  acc_copyout (p, sizeof (int));

  assert (p[0] == 1);
  
  /* Test 2: library input, pragma output.  */

  acc_copyin (p, sizeof (int));

#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 2;
  }

#pragma acc exit data copyout (p[0:1])
  
  assert (p[0] == 2);

  /* Test 3: library input, library output.  */

  acc_copyin (p, sizeof (int));

#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 3;
  }

  acc_copyout (p, sizeof (int));
  
  assert (p[0] == 3);

  /* Test 4: pragma input, pragma output.  */

#pragma acc enter data copyin (p[0:1])
  
#pragma acc parallel present (p[0:1]) num_gangs (1)
  {
    p[0] = 3;
  }

#pragma acc exit data copyout (p[0:1])
  
  assert (p[0] == 3);
  
  free (p);

  return 0;
}
Пример #11
0
int
main (int argc, char **argv)
{
  const int nthreads = 1;
  int i;
  pthread_attr_t attr;
  pthread_t *tid;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;

  acc_init (acc_device_nvidia);

  x = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      x[i] = i;
    }

  d_x = acc_copyin (x, N);

  if (acc_is_present (x, N) != 1)
    abort ();

  if (pthread_attr_init (&attr) != 0)
    perror ("pthread_attr_init failed");

  tid = (pthread_t *) malloc (nthreads * sizeof (pthread_t));

  for (i = 0; i < nthreads; i++)
    {
      if (pthread_create (&tid[i], &attr, &test, (void *) (unsigned long) (i))
	  != 0)
	perror ("pthread_create failed");
    }

  if (pthread_attr_destroy (&attr) != 0)
    perror ("pthread_attr_destroy failed");

  for (i = 0; i < nthreads; i++)
    {
      void *res;

      if (pthread_join (tid[i], &res) != 0)
	perror ("pthread join failed");
    }

  if (acc_is_present (x, N) != 1)
    abort ();

  memset (x, 0, N);

  acc_copyout (x, N);

  for (i = 0; i < N; i++)
    {
      if (x[i] != N - i - 1)
	abort ();
    }

  if (acc_is_present (x, N) != 0)
    abort ();

  acc_shutdown (acc_device_nvidia);

  return 0;
}
Пример #12
0
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay2;
  CUmodule module;
  CUresult r;
  int N;
  int i;
  CUstream *streams;
  unsigned long **a, **d_a, *tid, ticks;
  int nbytes;
  void *kargs[3];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay2, module, "delay2");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = sizeof (int);

  ticks = (unsigned long) (200.0 * clkrate);

  N = nprocs;

  streams = (CUstream *) malloc (N * sizeof (void *));

  a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  tid = (unsigned long *) malloc (N * sizeof (unsigned long));

  for (i = 0; i < N; i++)
    {
      a[i] = (unsigned long *) malloc (sizeof (unsigned long));
      *a[i] = N;
      d_a[i] = (unsigned long *) acc_malloc (nbytes);
      tid[i] = i;

      acc_map_data (a[i], d_a[i], nbytes);

      streams[i] = (CUstream) acc_get_cuda_stream (i);
      if (streams[i] != NULL)
        abort ();

      r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
      if (r != CUDA_SUCCESS)
        {
          fprintf (stderr, "cuStreamCreate failed: %d\n", r);
          abort ();
        }

       if (!acc_set_cuda_stream (i, streams[i]))
        abort ();
    }

  for (i = 0; i < N; i++)
    {
      kargs[0] = (void *) &d_a[i];
      kargs[1] = (void *) &ticks;
      kargs[2] = (void *) &tid[i];

      r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
      if (r != CUDA_SUCCESS)
	{
	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
	  abort ();
	}

      ticks = (unsigned long) (50.0 * clkrate);
    }

  acc_wait_all_async (0);

  for (i = 0; i < N; i++)
    {
      acc_copyout (a[i], nbytes);
      if (*a[i] != i)
	abort ();
    }

  free (streams);

  for (i = 0; i < N; i++)
    {
      free (a[i]);
    }

  free (a);
  free (d_a);
  free (tid);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}